{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 38475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001299545159194282, "grad_norm": 41.30098342895508, "learning_rate": 4.9987004548408065e-05, "loss": 4.5353, "step": 10 }, { "epoch": 0.002599090318388564, "grad_norm": 1.443534255027771, "learning_rate": 4.9974009096816114e-05, "loss": 0.7219, "step": 20 }, { "epoch": 0.003898635477582846, "grad_norm": 1.663198471069336, "learning_rate": 4.996101364522418e-05, "loss": 0.2444, "step": 30 }, { "epoch": 0.005198180636777128, "grad_norm": 5.204805850982666, "learning_rate": 4.9948018193632226e-05, "loss": 0.4272, "step": 40 }, { "epoch": 0.00649772579597141, "grad_norm": 0.8349509835243225, "learning_rate": 4.993502274204029e-05, "loss": 0.2702, "step": 50 }, { "epoch": 0.007797270955165692, "grad_norm": 1.51572585105896, "learning_rate": 4.992202729044835e-05, "loss": 0.4692, "step": 60 }, { "epoch": 0.009096816114359974, "grad_norm": 1.4026806354522705, "learning_rate": 4.99090318388564e-05, "loss": 0.5883, "step": 70 }, { "epoch": 0.010396361273554255, "grad_norm": 1.2262136936187744, "learning_rate": 4.989603638726446e-05, "loss": 0.3531, "step": 80 }, { "epoch": 0.011695906432748537, "grad_norm": 0.8910953998565674, "learning_rate": 4.988304093567251e-05, "loss": 0.2972, "step": 90 }, { "epoch": 0.01299545159194282, "grad_norm": 1.5723525285720825, "learning_rate": 4.9870045484080575e-05, "loss": 0.2368, "step": 100 }, { "epoch": 0.014294996751137101, "grad_norm": 0.8725693821907043, "learning_rate": 4.985705003248863e-05, "loss": 0.2708, "step": 110 }, { "epoch": 0.015594541910331383, "grad_norm": 1.2071843147277832, "learning_rate": 4.9844054580896686e-05, "loss": 0.2847, "step": 120 }, { "epoch": 0.016894087069525665, "grad_norm": 1.239086627960205, "learning_rate": 4.983105912930475e-05, "loss": 0.5376, "step": 130 }, { "epoch": 0.018193632228719947, "grad_norm": 0.9031692743301392, "learning_rate": 4.98180636777128e-05, "loss": 0.273, "step": 140 }, { "epoch": 0.01949317738791423, "grad_norm": 0.8972965478897095, "learning_rate": 4.980506822612086e-05, "loss": 0.3191, "step": 150 }, { "epoch": 0.02079272254710851, "grad_norm": 0.7087318301200867, "learning_rate": 4.9792072774528916e-05, "loss": 0.3364, "step": 160 }, { "epoch": 0.022092267706302793, "grad_norm": 0.3314965069293976, "learning_rate": 4.977907732293697e-05, "loss": 0.3047, "step": 170 }, { "epoch": 0.023391812865497075, "grad_norm": 0.5017286539077759, "learning_rate": 4.9766081871345035e-05, "loss": 0.3242, "step": 180 }, { "epoch": 0.024691358024691357, "grad_norm": 0.5897062420845032, "learning_rate": 4.9753086419753084e-05, "loss": 0.2957, "step": 190 }, { "epoch": 0.02599090318388564, "grad_norm": 0.8439343571662903, "learning_rate": 4.974009096816115e-05, "loss": 0.2622, "step": 200 }, { "epoch": 0.02729044834307992, "grad_norm": 0.48378536105155945, "learning_rate": 4.97270955165692e-05, "loss": 0.4035, "step": 210 }, { "epoch": 0.028589993502274202, "grad_norm": 1.2207881212234497, "learning_rate": 4.971410006497726e-05, "loss": 0.3097, "step": 220 }, { "epoch": 0.029889538661468484, "grad_norm": 0.7982758283615112, "learning_rate": 4.970110461338532e-05, "loss": 0.4041, "step": 230 }, { "epoch": 0.031189083820662766, "grad_norm": 0.36489930748939514, "learning_rate": 4.968810916179337e-05, "loss": 0.3005, "step": 240 }, { "epoch": 0.03248862897985705, "grad_norm": 0.657124400138855, "learning_rate": 4.967511371020143e-05, "loss": 0.2509, "step": 250 }, { "epoch": 0.03378817413905133, "grad_norm": 0.6492319703102112, "learning_rate": 4.966211825860949e-05, "loss": 0.2986, "step": 260 }, { "epoch": 0.03508771929824561, "grad_norm": 0.4924551844596863, "learning_rate": 4.9649122807017544e-05, "loss": 0.2879, "step": 270 }, { "epoch": 0.036387264457439894, "grad_norm": 0.9120991826057434, "learning_rate": 4.963612735542561e-05, "loss": 0.3249, "step": 280 }, { "epoch": 0.037686809616634176, "grad_norm": 0.6250187754631042, "learning_rate": 4.9623131903833656e-05, "loss": 0.3749, "step": 290 }, { "epoch": 0.03898635477582846, "grad_norm": 0.39895540475845337, "learning_rate": 4.961013645224172e-05, "loss": 0.2669, "step": 300 }, { "epoch": 0.04028589993502274, "grad_norm": 0.8572816848754883, "learning_rate": 4.9597141000649775e-05, "loss": 0.2789, "step": 310 }, { "epoch": 0.04158544509421702, "grad_norm": 0.3779860734939575, "learning_rate": 4.958414554905783e-05, "loss": 0.2242, "step": 320 }, { "epoch": 0.042884990253411304, "grad_norm": 0.8410694599151611, "learning_rate": 4.957115009746589e-05, "loss": 0.4368, "step": 330 }, { "epoch": 0.044184535412605586, "grad_norm": 0.6876969337463379, "learning_rate": 4.955815464587394e-05, "loss": 0.3666, "step": 340 }, { "epoch": 0.04548408057179987, "grad_norm": 0.48048824071884155, "learning_rate": 4.9545159194282005e-05, "loss": 0.178, "step": 350 }, { "epoch": 0.04678362573099415, "grad_norm": 0.8548028469085693, "learning_rate": 4.953216374269006e-05, "loss": 0.2937, "step": 360 }, { "epoch": 0.04808317089018843, "grad_norm": 0.5617946982383728, "learning_rate": 4.9519168291098116e-05, "loss": 0.183, "step": 370 }, { "epoch": 0.04938271604938271, "grad_norm": 0.49044132232666016, "learning_rate": 4.950617283950618e-05, "loss": 0.3351, "step": 380 }, { "epoch": 0.050682261208576995, "grad_norm": 0.46596279740333557, "learning_rate": 4.949317738791423e-05, "loss": 0.2102, "step": 390 }, { "epoch": 0.05198180636777128, "grad_norm": 0.3272324800491333, "learning_rate": 4.948018193632229e-05, "loss": 0.2364, "step": 400 }, { "epoch": 0.05328135152696556, "grad_norm": 0.6175327301025391, "learning_rate": 4.946718648473035e-05, "loss": 0.2408, "step": 410 }, { "epoch": 0.05458089668615984, "grad_norm": 0.356032133102417, "learning_rate": 4.94541910331384e-05, "loss": 0.2966, "step": 420 }, { "epoch": 0.05588044184535412, "grad_norm": 0.6646308302879333, "learning_rate": 4.9441195581546465e-05, "loss": 0.1797, "step": 430 }, { "epoch": 0.057179987004548405, "grad_norm": 0.20787227153778076, "learning_rate": 4.9428200129954514e-05, "loss": 0.1634, "step": 440 }, { "epoch": 0.05847953216374269, "grad_norm": 0.5891383290290833, "learning_rate": 4.941520467836258e-05, "loss": 0.3527, "step": 450 }, { "epoch": 0.05977907732293697, "grad_norm": 0.298549085855484, "learning_rate": 4.940220922677063e-05, "loss": 0.2861, "step": 460 }, { "epoch": 0.06107862248213125, "grad_norm": 0.3169096112251282, "learning_rate": 4.938921377517869e-05, "loss": 0.2547, "step": 470 }, { "epoch": 0.06237816764132553, "grad_norm": 0.5349656343460083, "learning_rate": 4.937621832358675e-05, "loss": 0.3172, "step": 480 }, { "epoch": 0.06367771280051981, "grad_norm": 0.6160313487052917, "learning_rate": 4.93632228719948e-05, "loss": 0.2642, "step": 490 }, { "epoch": 0.0649772579597141, "grad_norm": 1.0493474006652832, "learning_rate": 4.935022742040286e-05, "loss": 0.3084, "step": 500 }, { "epoch": 0.06627680311890838, "grad_norm": 0.3765532076358795, "learning_rate": 4.933723196881092e-05, "loss": 0.1882, "step": 510 }, { "epoch": 0.06757634827810266, "grad_norm": 0.7683295011520386, "learning_rate": 4.9324236517218975e-05, "loss": 0.2715, "step": 520 }, { "epoch": 0.06887589343729694, "grad_norm": 0.6514467597007751, "learning_rate": 4.931124106562704e-05, "loss": 0.2841, "step": 530 }, { "epoch": 0.07017543859649122, "grad_norm": 0.9201865196228027, "learning_rate": 4.9298245614035086e-05, "loss": 0.2513, "step": 540 }, { "epoch": 0.0714749837556855, "grad_norm": 0.7695584893226624, "learning_rate": 4.928525016244315e-05, "loss": 0.4083, "step": 550 }, { "epoch": 0.07277452891487979, "grad_norm": 1.090301275253296, "learning_rate": 4.9272254710851205e-05, "loss": 0.345, "step": 560 }, { "epoch": 0.07407407407407407, "grad_norm": 0.47511181235313416, "learning_rate": 4.925925925925926e-05, "loss": 0.318, "step": 570 }, { "epoch": 0.07537361923326835, "grad_norm": 0.31254202127456665, "learning_rate": 4.9246263807667317e-05, "loss": 0.2608, "step": 580 }, { "epoch": 0.07667316439246263, "grad_norm": 0.2379496842622757, "learning_rate": 4.923326835607537e-05, "loss": 0.2573, "step": 590 }, { "epoch": 0.07797270955165692, "grad_norm": 0.4698795974254608, "learning_rate": 4.9220272904483435e-05, "loss": 0.2439, "step": 600 }, { "epoch": 0.0792722547108512, "grad_norm": 0.5984089374542236, "learning_rate": 4.920727745289149e-05, "loss": 0.2482, "step": 610 }, { "epoch": 0.08057179987004548, "grad_norm": 0.3086070120334625, "learning_rate": 4.919428200129955e-05, "loss": 0.1835, "step": 620 }, { "epoch": 0.08187134502923976, "grad_norm": 0.3606715202331543, "learning_rate": 4.91812865497076e-05, "loss": 0.2168, "step": 630 }, { "epoch": 0.08317089018843404, "grad_norm": 0.9201058149337769, "learning_rate": 4.916829109811566e-05, "loss": 0.2665, "step": 640 }, { "epoch": 0.08447043534762833, "grad_norm": 0.4693484604358673, "learning_rate": 4.915529564652372e-05, "loss": 0.3286, "step": 650 }, { "epoch": 0.08576998050682261, "grad_norm": 0.484390527009964, "learning_rate": 4.914230019493178e-05, "loss": 0.3097, "step": 660 }, { "epoch": 0.08706952566601689, "grad_norm": 0.6319616436958313, "learning_rate": 4.912930474333983e-05, "loss": 0.3313, "step": 670 }, { "epoch": 0.08836907082521117, "grad_norm": 0.5568938851356506, "learning_rate": 4.911630929174789e-05, "loss": 0.2214, "step": 680 }, { "epoch": 0.08966861598440545, "grad_norm": 0.41651806235313416, "learning_rate": 4.9103313840155945e-05, "loss": 0.2555, "step": 690 }, { "epoch": 0.09096816114359974, "grad_norm": 0.2280704826116562, "learning_rate": 4.909031838856401e-05, "loss": 0.1853, "step": 700 }, { "epoch": 0.09226770630279402, "grad_norm": 0.42000120878219604, "learning_rate": 4.907732293697206e-05, "loss": 0.3306, "step": 710 }, { "epoch": 0.0935672514619883, "grad_norm": 0.6266846060752869, "learning_rate": 4.906432748538012e-05, "loss": 0.2674, "step": 720 }, { "epoch": 0.09486679662118258, "grad_norm": 0.8585039377212524, "learning_rate": 4.9051332033788175e-05, "loss": 0.2679, "step": 730 }, { "epoch": 0.09616634178037686, "grad_norm": 0.504865825176239, "learning_rate": 4.903833658219623e-05, "loss": 0.2436, "step": 740 }, { "epoch": 0.09746588693957114, "grad_norm": 0.6874143481254578, "learning_rate": 4.902534113060429e-05, "loss": 0.2271, "step": 750 }, { "epoch": 0.09876543209876543, "grad_norm": 0.6879366636276245, "learning_rate": 4.901234567901235e-05, "loss": 0.2472, "step": 760 }, { "epoch": 0.10006497725795971, "grad_norm": 0.6926954388618469, "learning_rate": 4.8999350227420405e-05, "loss": 0.4307, "step": 770 }, { "epoch": 0.10136452241715399, "grad_norm": 0.4149816036224365, "learning_rate": 4.898635477582846e-05, "loss": 0.3332, "step": 780 }, { "epoch": 0.10266406757634827, "grad_norm": 1.3162387609481812, "learning_rate": 4.897335932423652e-05, "loss": 0.3699, "step": 790 }, { "epoch": 0.10396361273554255, "grad_norm": 0.4419211447238922, "learning_rate": 4.896036387264458e-05, "loss": 0.2164, "step": 800 }, { "epoch": 0.10526315789473684, "grad_norm": 0.7431888580322266, "learning_rate": 4.8947368421052635e-05, "loss": 0.3532, "step": 810 }, { "epoch": 0.10656270305393112, "grad_norm": 0.5344225764274597, "learning_rate": 4.893437296946069e-05, "loss": 0.3566, "step": 820 }, { "epoch": 0.1078622482131254, "grad_norm": 0.40301278233528137, "learning_rate": 4.892137751786875e-05, "loss": 0.2473, "step": 830 }, { "epoch": 0.10916179337231968, "grad_norm": 0.3268270194530487, "learning_rate": 4.89083820662768e-05, "loss": 0.2995, "step": 840 }, { "epoch": 0.11046133853151396, "grad_norm": 0.5513905882835388, "learning_rate": 4.8895386614684865e-05, "loss": 0.1927, "step": 850 }, { "epoch": 0.11176088369070825, "grad_norm": 0.15016616880893707, "learning_rate": 4.888239116309292e-05, "loss": 0.3256, "step": 860 }, { "epoch": 0.11306042884990253, "grad_norm": 0.6452059745788574, "learning_rate": 4.886939571150098e-05, "loss": 0.3577, "step": 870 }, { "epoch": 0.11435997400909681, "grad_norm": 0.6533651351928711, "learning_rate": 4.885640025990903e-05, "loss": 0.4024, "step": 880 }, { "epoch": 0.11565951916829109, "grad_norm": 0.40710362792015076, "learning_rate": 4.884340480831709e-05, "loss": 0.1966, "step": 890 }, { "epoch": 0.11695906432748537, "grad_norm": 0.486055850982666, "learning_rate": 4.883040935672515e-05, "loss": 0.2462, "step": 900 }, { "epoch": 0.11825860948667966, "grad_norm": 0.9155021905899048, "learning_rate": 4.881741390513321e-05, "loss": 0.3142, "step": 910 }, { "epoch": 0.11955815464587394, "grad_norm": 0.2139640599489212, "learning_rate": 4.880441845354126e-05, "loss": 0.3435, "step": 920 }, { "epoch": 0.12085769980506822, "grad_norm": 0.364255428314209, "learning_rate": 4.879142300194932e-05, "loss": 0.2567, "step": 930 }, { "epoch": 0.1221572449642625, "grad_norm": 0.732996940612793, "learning_rate": 4.8778427550357375e-05, "loss": 0.2367, "step": 940 }, { "epoch": 0.12345679012345678, "grad_norm": 0.7348796129226685, "learning_rate": 4.876543209876544e-05, "loss": 0.3915, "step": 950 }, { "epoch": 0.12475633528265107, "grad_norm": 0.36435467004776, "learning_rate": 4.875243664717349e-05, "loss": 0.283, "step": 960 }, { "epoch": 0.12605588044184535, "grad_norm": 0.4456409513950348, "learning_rate": 4.873944119558155e-05, "loss": 0.1872, "step": 970 }, { "epoch": 0.12735542560103963, "grad_norm": 0.6957472562789917, "learning_rate": 4.8726445743989605e-05, "loss": 0.3688, "step": 980 }, { "epoch": 0.1286549707602339, "grad_norm": 0.4436233341693878, "learning_rate": 4.871345029239766e-05, "loss": 0.2903, "step": 990 }, { "epoch": 0.1299545159194282, "grad_norm": 0.5441693067550659, "learning_rate": 4.8700454840805724e-05, "loss": 0.2441, "step": 1000 }, { "epoch": 0.13125406107862247, "grad_norm": 0.33866769075393677, "learning_rate": 4.868745938921378e-05, "loss": 0.344, "step": 1010 }, { "epoch": 0.13255360623781676, "grad_norm": 0.674666702747345, "learning_rate": 4.8674463937621835e-05, "loss": 0.3585, "step": 1020 }, { "epoch": 0.13385315139701104, "grad_norm": 0.286886990070343, "learning_rate": 4.866146848602989e-05, "loss": 0.2073, "step": 1030 }, { "epoch": 0.13515269655620532, "grad_norm": 0.30971381068229675, "learning_rate": 4.864847303443795e-05, "loss": 0.1869, "step": 1040 }, { "epoch": 0.1364522417153996, "grad_norm": 0.5809982419013977, "learning_rate": 4.863547758284601e-05, "loss": 0.2791, "step": 1050 }, { "epoch": 0.13775178687459388, "grad_norm": 0.3548571467399597, "learning_rate": 4.8622482131254065e-05, "loss": 0.2546, "step": 1060 }, { "epoch": 0.13905133203378817, "grad_norm": 0.4543963372707367, "learning_rate": 4.860948667966212e-05, "loss": 0.2503, "step": 1070 }, { "epoch": 0.14035087719298245, "grad_norm": 0.32342734932899475, "learning_rate": 4.859649122807018e-05, "loss": 0.209, "step": 1080 }, { "epoch": 0.14165042235217673, "grad_norm": 0.6518754959106445, "learning_rate": 4.858349577647823e-05, "loss": 0.3273, "step": 1090 }, { "epoch": 0.142949967511371, "grad_norm": 0.537175178527832, "learning_rate": 4.857050032488629e-05, "loss": 0.1946, "step": 1100 }, { "epoch": 0.1442495126705653, "grad_norm": 0.905509352684021, "learning_rate": 4.855750487329435e-05, "loss": 0.2929, "step": 1110 }, { "epoch": 0.14554905782975958, "grad_norm": 0.3125850558280945, "learning_rate": 4.854450942170241e-05, "loss": 0.2841, "step": 1120 }, { "epoch": 0.14684860298895386, "grad_norm": 0.5911197066307068, "learning_rate": 4.853151397011046e-05, "loss": 0.185, "step": 1130 }, { "epoch": 0.14814814814814814, "grad_norm": 0.7286120057106018, "learning_rate": 4.851851851851852e-05, "loss": 0.2399, "step": 1140 }, { "epoch": 0.14944769330734242, "grad_norm": 0.48923125863075256, "learning_rate": 4.8505523066926575e-05, "loss": 0.2576, "step": 1150 }, { "epoch": 0.1507472384665367, "grad_norm": 0.4319550395011902, "learning_rate": 4.849252761533464e-05, "loss": 0.2889, "step": 1160 }, { "epoch": 0.15204678362573099, "grad_norm": 0.4707675576210022, "learning_rate": 4.847953216374269e-05, "loss": 0.2653, "step": 1170 }, { "epoch": 0.15334632878492527, "grad_norm": 0.5908811688423157, "learning_rate": 4.846653671215075e-05, "loss": 0.2402, "step": 1180 }, { "epoch": 0.15464587394411955, "grad_norm": 0.4428640902042389, "learning_rate": 4.8453541260558805e-05, "loss": 0.2772, "step": 1190 }, { "epoch": 0.15594541910331383, "grad_norm": 0.5300672054290771, "learning_rate": 4.844054580896686e-05, "loss": 0.1922, "step": 1200 }, { "epoch": 0.1572449642625081, "grad_norm": 0.7178001999855042, "learning_rate": 4.8427550357374924e-05, "loss": 0.2357, "step": 1210 }, { "epoch": 0.1585445094217024, "grad_norm": 0.6012532711029053, "learning_rate": 4.841455490578298e-05, "loss": 0.2256, "step": 1220 }, { "epoch": 0.15984405458089668, "grad_norm": 0.3474193811416626, "learning_rate": 4.8401559454191035e-05, "loss": 0.2287, "step": 1230 }, { "epoch": 0.16114359974009096, "grad_norm": 0.4851144552230835, "learning_rate": 4.838856400259909e-05, "loss": 0.2686, "step": 1240 }, { "epoch": 0.16244314489928524, "grad_norm": 0.24471335113048553, "learning_rate": 4.837556855100715e-05, "loss": 0.3264, "step": 1250 }, { "epoch": 0.16374269005847952, "grad_norm": 1.099420189857483, "learning_rate": 4.836257309941521e-05, "loss": 0.2441, "step": 1260 }, { "epoch": 0.1650422352176738, "grad_norm": 0.4516688287258148, "learning_rate": 4.8349577647823265e-05, "loss": 0.1768, "step": 1270 }, { "epoch": 0.1663417803768681, "grad_norm": 0.47558343410491943, "learning_rate": 4.833658219623132e-05, "loss": 0.1292, "step": 1280 }, { "epoch": 0.16764132553606237, "grad_norm": 0.2012924998998642, "learning_rate": 4.832358674463938e-05, "loss": 0.2404, "step": 1290 }, { "epoch": 0.16894087069525665, "grad_norm": 0.8301674723625183, "learning_rate": 4.831059129304743e-05, "loss": 0.2599, "step": 1300 }, { "epoch": 0.17024041585445093, "grad_norm": 0.5828979015350342, "learning_rate": 4.8297595841455496e-05, "loss": 0.2578, "step": 1310 }, { "epoch": 0.17153996101364521, "grad_norm": 0.3859652876853943, "learning_rate": 4.828460038986355e-05, "loss": 0.2517, "step": 1320 }, { "epoch": 0.1728395061728395, "grad_norm": 0.339762806892395, "learning_rate": 4.827160493827161e-05, "loss": 0.2117, "step": 1330 }, { "epoch": 0.17413905133203378, "grad_norm": 0.41186046600341797, "learning_rate": 4.825860948667966e-05, "loss": 0.1405, "step": 1340 }, { "epoch": 0.17543859649122806, "grad_norm": 0.2664526700973511, "learning_rate": 4.824561403508772e-05, "loss": 0.2011, "step": 1350 }, { "epoch": 0.17673814165042234, "grad_norm": 0.6707004308700562, "learning_rate": 4.823261858349578e-05, "loss": 0.2936, "step": 1360 }, { "epoch": 0.17803768680961662, "grad_norm": 0.5659996271133423, "learning_rate": 4.821962313190384e-05, "loss": 0.2481, "step": 1370 }, { "epoch": 0.1793372319688109, "grad_norm": 0.7128964066505432, "learning_rate": 4.8206627680311893e-05, "loss": 0.2873, "step": 1380 }, { "epoch": 0.1806367771280052, "grad_norm": 0.6504437327384949, "learning_rate": 4.819363222871995e-05, "loss": 0.1993, "step": 1390 }, { "epoch": 0.18193632228719947, "grad_norm": 0.6511735320091248, "learning_rate": 4.8180636777128005e-05, "loss": 0.2462, "step": 1400 }, { "epoch": 0.18323586744639375, "grad_norm": 0.518339991569519, "learning_rate": 4.816764132553607e-05, "loss": 0.3242, "step": 1410 }, { "epoch": 0.18453541260558803, "grad_norm": 0.21054671704769135, "learning_rate": 4.8154645873944124e-05, "loss": 0.2433, "step": 1420 }, { "epoch": 0.18583495776478232, "grad_norm": 0.35523542761802673, "learning_rate": 4.814165042235218e-05, "loss": 0.2237, "step": 1430 }, { "epoch": 0.1871345029239766, "grad_norm": 0.4175126850605011, "learning_rate": 4.8128654970760235e-05, "loss": 0.2442, "step": 1440 }, { "epoch": 0.18843404808317088, "grad_norm": 0.8019803762435913, "learning_rate": 4.811565951916829e-05, "loss": 0.2645, "step": 1450 }, { "epoch": 0.18973359324236516, "grad_norm": 0.2513140141963959, "learning_rate": 4.8102664067576354e-05, "loss": 0.1814, "step": 1460 }, { "epoch": 0.19103313840155944, "grad_norm": 0.38294950127601624, "learning_rate": 4.808966861598441e-05, "loss": 0.1862, "step": 1470 }, { "epoch": 0.19233268356075373, "grad_norm": 0.3685075640678406, "learning_rate": 4.8076673164392466e-05, "loss": 0.2407, "step": 1480 }, { "epoch": 0.193632228719948, "grad_norm": 0.17785777151584625, "learning_rate": 4.806367771280052e-05, "loss": 0.2644, "step": 1490 }, { "epoch": 0.1949317738791423, "grad_norm": 0.4091084599494934, "learning_rate": 4.805068226120858e-05, "loss": 0.3602, "step": 1500 }, { "epoch": 0.19623131903833657, "grad_norm": 0.662257730960846, "learning_rate": 4.803768680961664e-05, "loss": 0.2988, "step": 1510 }, { "epoch": 0.19753086419753085, "grad_norm": 0.5170376896858215, "learning_rate": 4.8024691358024696e-05, "loss": 0.3224, "step": 1520 }, { "epoch": 0.19883040935672514, "grad_norm": 0.24961428344249725, "learning_rate": 4.801169590643275e-05, "loss": 0.2322, "step": 1530 }, { "epoch": 0.20012995451591942, "grad_norm": 0.9033471941947937, "learning_rate": 4.799870045484081e-05, "loss": 0.2617, "step": 1540 }, { "epoch": 0.2014294996751137, "grad_norm": 0.5366242527961731, "learning_rate": 4.798570500324886e-05, "loss": 0.2129, "step": 1550 }, { "epoch": 0.20272904483430798, "grad_norm": 0.22565129399299622, "learning_rate": 4.7972709551656926e-05, "loss": 0.2496, "step": 1560 }, { "epoch": 0.20402858999350226, "grad_norm": 0.9439752697944641, "learning_rate": 4.795971410006498e-05, "loss": 0.1899, "step": 1570 }, { "epoch": 0.20532813515269654, "grad_norm": 1.164827585220337, "learning_rate": 4.794671864847304e-05, "loss": 0.2801, "step": 1580 }, { "epoch": 0.20662768031189083, "grad_norm": 0.23354162275791168, "learning_rate": 4.7933723196881094e-05, "loss": 0.3074, "step": 1590 }, { "epoch": 0.2079272254710851, "grad_norm": 0.7572689652442932, "learning_rate": 4.792072774528915e-05, "loss": 0.3407, "step": 1600 }, { "epoch": 0.2092267706302794, "grad_norm": 0.7670542001724243, "learning_rate": 4.790773229369721e-05, "loss": 0.301, "step": 1610 }, { "epoch": 0.21052631578947367, "grad_norm": 0.5419236421585083, "learning_rate": 4.789473684210526e-05, "loss": 0.2198, "step": 1620 }, { "epoch": 0.21182586094866795, "grad_norm": 0.3850794732570648, "learning_rate": 4.7881741390513324e-05, "loss": 0.2328, "step": 1630 }, { "epoch": 0.21312540610786224, "grad_norm": 0.6401352882385254, "learning_rate": 4.786874593892138e-05, "loss": 0.2301, "step": 1640 }, { "epoch": 0.21442495126705652, "grad_norm": 0.4379999041557312, "learning_rate": 4.7855750487329435e-05, "loss": 0.2752, "step": 1650 }, { "epoch": 0.2157244964262508, "grad_norm": 0.2596849501132965, "learning_rate": 4.78427550357375e-05, "loss": 0.1458, "step": 1660 }, { "epoch": 0.21702404158544508, "grad_norm": 0.6725589632987976, "learning_rate": 4.782975958414555e-05, "loss": 0.2529, "step": 1670 }, { "epoch": 0.21832358674463936, "grad_norm": 0.25933966040611267, "learning_rate": 4.781676413255361e-05, "loss": 0.3062, "step": 1680 }, { "epoch": 0.21962313190383365, "grad_norm": 0.22651349008083344, "learning_rate": 4.7803768680961666e-05, "loss": 0.1656, "step": 1690 }, { "epoch": 0.22092267706302793, "grad_norm": 0.2469228059053421, "learning_rate": 4.779077322936972e-05, "loss": 0.2302, "step": 1700 }, { "epoch": 0.2222222222222222, "grad_norm": 0.8071635961532593, "learning_rate": 4.7777777777777784e-05, "loss": 0.1878, "step": 1710 }, { "epoch": 0.2235217673814165, "grad_norm": 0.8754170536994934, "learning_rate": 4.776478232618583e-05, "loss": 0.2477, "step": 1720 }, { "epoch": 0.22482131254061077, "grad_norm": 0.18023255467414856, "learning_rate": 4.7751786874593896e-05, "loss": 0.2098, "step": 1730 }, { "epoch": 0.22612085769980506, "grad_norm": 0.4748021364212036, "learning_rate": 4.773879142300195e-05, "loss": 0.2513, "step": 1740 }, { "epoch": 0.22742040285899934, "grad_norm": 0.8154075145721436, "learning_rate": 4.772579597141001e-05, "loss": 0.1487, "step": 1750 }, { "epoch": 0.22871994801819362, "grad_norm": 0.4774879515171051, "learning_rate": 4.771280051981807e-05, "loss": 0.2236, "step": 1760 }, { "epoch": 0.2300194931773879, "grad_norm": 0.3668650686740875, "learning_rate": 4.769980506822612e-05, "loss": 0.1706, "step": 1770 }, { "epoch": 0.23131903833658218, "grad_norm": 0.4063751697540283, "learning_rate": 4.768680961663418e-05, "loss": 0.2356, "step": 1780 }, { "epoch": 0.23261858349577647, "grad_norm": 0.6622290015220642, "learning_rate": 4.767381416504224e-05, "loss": 0.3144, "step": 1790 }, { "epoch": 0.23391812865497075, "grad_norm": 0.24696584045886993, "learning_rate": 4.7660818713450294e-05, "loss": 0.2863, "step": 1800 }, { "epoch": 0.23521767381416503, "grad_norm": 0.19252406060695648, "learning_rate": 4.7647823261858356e-05, "loss": 0.2985, "step": 1810 }, { "epoch": 0.2365172189733593, "grad_norm": 0.2988499402999878, "learning_rate": 4.7634827810266405e-05, "loss": 0.2339, "step": 1820 }, { "epoch": 0.2378167641325536, "grad_norm": 0.3648994565010071, "learning_rate": 4.762183235867447e-05, "loss": 0.2316, "step": 1830 }, { "epoch": 0.23911630929174787, "grad_norm": 1.716273546218872, "learning_rate": 4.7608836907082524e-05, "loss": 0.4105, "step": 1840 }, { "epoch": 0.24041585445094216, "grad_norm": 0.5278337001800537, "learning_rate": 4.759584145549058e-05, "loss": 0.2654, "step": 1850 }, { "epoch": 0.24171539961013644, "grad_norm": 0.957542359828949, "learning_rate": 4.758284600389864e-05, "loss": 0.3482, "step": 1860 }, { "epoch": 0.24301494476933072, "grad_norm": 0.4067757725715637, "learning_rate": 4.756985055230669e-05, "loss": 0.3122, "step": 1870 }, { "epoch": 0.244314489928525, "grad_norm": 0.476656049489975, "learning_rate": 4.7556855100714754e-05, "loss": 0.2344, "step": 1880 }, { "epoch": 0.24561403508771928, "grad_norm": 0.44574519991874695, "learning_rate": 4.754385964912281e-05, "loss": 0.2605, "step": 1890 }, { "epoch": 0.24691358024691357, "grad_norm": 0.4272555708885193, "learning_rate": 4.7530864197530866e-05, "loss": 0.1565, "step": 1900 }, { "epoch": 0.24821312540610785, "grad_norm": 0.42834383249282837, "learning_rate": 4.751786874593893e-05, "loss": 0.1692, "step": 1910 }, { "epoch": 0.24951267056530213, "grad_norm": 0.4081329107284546, "learning_rate": 4.750487329434698e-05, "loss": 0.2374, "step": 1920 }, { "epoch": 0.2508122157244964, "grad_norm": 0.22699742019176483, "learning_rate": 4.749187784275504e-05, "loss": 0.2313, "step": 1930 }, { "epoch": 0.2521117608836907, "grad_norm": 0.5505887269973755, "learning_rate": 4.7478882391163096e-05, "loss": 0.2486, "step": 1940 }, { "epoch": 0.253411306042885, "grad_norm": 0.8939064145088196, "learning_rate": 4.746588693957115e-05, "loss": 0.2941, "step": 1950 }, { "epoch": 0.25471085120207926, "grad_norm": 0.5881379842758179, "learning_rate": 4.7452891487979214e-05, "loss": 0.2372, "step": 1960 }, { "epoch": 0.25601039636127354, "grad_norm": 0.22253429889678955, "learning_rate": 4.7439896036387263e-05, "loss": 0.2065, "step": 1970 }, { "epoch": 0.2573099415204678, "grad_norm": 0.6524843573570251, "learning_rate": 4.7426900584795326e-05, "loss": 0.3652, "step": 1980 }, { "epoch": 0.2586094866796621, "grad_norm": 0.24652263522148132, "learning_rate": 4.741390513320338e-05, "loss": 0.2722, "step": 1990 }, { "epoch": 0.2599090318388564, "grad_norm": 0.24468880891799927, "learning_rate": 4.740090968161144e-05, "loss": 0.2359, "step": 2000 }, { "epoch": 0.26120857699805067, "grad_norm": 0.39338287711143494, "learning_rate": 4.73879142300195e-05, "loss": 0.2423, "step": 2010 }, { "epoch": 0.26250812215724495, "grad_norm": 1.0336627960205078, "learning_rate": 4.737491877842755e-05, "loss": 0.1927, "step": 2020 }, { "epoch": 0.26380766731643923, "grad_norm": 0.6400977969169617, "learning_rate": 4.736192332683561e-05, "loss": 0.1919, "step": 2030 }, { "epoch": 0.2651072124756335, "grad_norm": 0.4477030634880066, "learning_rate": 4.734892787524367e-05, "loss": 0.2864, "step": 2040 }, { "epoch": 0.2664067576348278, "grad_norm": 0.2494133859872818, "learning_rate": 4.7335932423651724e-05, "loss": 0.2059, "step": 2050 }, { "epoch": 0.2677063027940221, "grad_norm": 0.3210766911506653, "learning_rate": 4.7322936972059787e-05, "loss": 0.1555, "step": 2060 }, { "epoch": 0.26900584795321636, "grad_norm": 0.33700668811798096, "learning_rate": 4.7309941520467836e-05, "loss": 0.2401, "step": 2070 }, { "epoch": 0.27030539311241064, "grad_norm": 0.5000825524330139, "learning_rate": 4.72969460688759e-05, "loss": 0.2093, "step": 2080 }, { "epoch": 0.2716049382716049, "grad_norm": 0.38857951760292053, "learning_rate": 4.7283950617283954e-05, "loss": 0.2286, "step": 2090 }, { "epoch": 0.2729044834307992, "grad_norm": 1.613774061203003, "learning_rate": 4.727095516569201e-05, "loss": 0.1869, "step": 2100 }, { "epoch": 0.2742040285899935, "grad_norm": 0.34125882387161255, "learning_rate": 4.725795971410007e-05, "loss": 0.1737, "step": 2110 }, { "epoch": 0.27550357374918777, "grad_norm": 0.38936132192611694, "learning_rate": 4.724496426250812e-05, "loss": 0.2327, "step": 2120 }, { "epoch": 0.27680311890838205, "grad_norm": 0.6074032783508301, "learning_rate": 4.7231968810916184e-05, "loss": 0.2244, "step": 2130 }, { "epoch": 0.27810266406757633, "grad_norm": 0.26510998606681824, "learning_rate": 4.721897335932423e-05, "loss": 0.2048, "step": 2140 }, { "epoch": 0.2794022092267706, "grad_norm": 0.35818949341773987, "learning_rate": 4.7205977907732296e-05, "loss": 0.1778, "step": 2150 }, { "epoch": 0.2807017543859649, "grad_norm": 0.4453869163990021, "learning_rate": 4.719298245614036e-05, "loss": 0.2248, "step": 2160 }, { "epoch": 0.2820012995451592, "grad_norm": 0.7275928854942322, "learning_rate": 4.717998700454841e-05, "loss": 0.2166, "step": 2170 }, { "epoch": 0.28330084470435346, "grad_norm": 0.3994103670120239, "learning_rate": 4.716699155295647e-05, "loss": 0.2205, "step": 2180 }, { "epoch": 0.28460038986354774, "grad_norm": 0.5071271657943726, "learning_rate": 4.715399610136452e-05, "loss": 0.2866, "step": 2190 }, { "epoch": 0.285899935022742, "grad_norm": 0.4845096468925476, "learning_rate": 4.714100064977258e-05, "loss": 0.2231, "step": 2200 }, { "epoch": 0.2871994801819363, "grad_norm": 0.6157258152961731, "learning_rate": 4.7128005198180645e-05, "loss": 0.2017, "step": 2210 }, { "epoch": 0.2884990253411306, "grad_norm": 0.26529213786125183, "learning_rate": 4.7115009746588694e-05, "loss": 0.203, "step": 2220 }, { "epoch": 0.28979857050032487, "grad_norm": 0.4161243736743927, "learning_rate": 4.7102014294996756e-05, "loss": 0.251, "step": 2230 }, { "epoch": 0.29109811565951915, "grad_norm": 0.5243601202964783, "learning_rate": 4.7089018843404805e-05, "loss": 0.1622, "step": 2240 }, { "epoch": 0.29239766081871343, "grad_norm": 0.6494327187538147, "learning_rate": 4.707602339181287e-05, "loss": 0.3065, "step": 2250 }, { "epoch": 0.2936972059779077, "grad_norm": 0.39275476336479187, "learning_rate": 4.7063027940220924e-05, "loss": 0.3258, "step": 2260 }, { "epoch": 0.294996751137102, "grad_norm": 0.5810893774032593, "learning_rate": 4.705003248862898e-05, "loss": 0.2265, "step": 2270 }, { "epoch": 0.2962962962962963, "grad_norm": 0.14503473043441772, "learning_rate": 4.703703703703704e-05, "loss": 0.1594, "step": 2280 }, { "epoch": 0.29759584145549056, "grad_norm": 0.3469076454639435, "learning_rate": 4.702404158544509e-05, "loss": 0.1934, "step": 2290 }, { "epoch": 0.29889538661468484, "grad_norm": 0.4732000231742859, "learning_rate": 4.7011046133853154e-05, "loss": 0.2288, "step": 2300 }, { "epoch": 0.3001949317738791, "grad_norm": 0.5140984654426575, "learning_rate": 4.699805068226121e-05, "loss": 0.2038, "step": 2310 }, { "epoch": 0.3014944769330734, "grad_norm": 0.8638989329338074, "learning_rate": 4.6985055230669266e-05, "loss": 0.3046, "step": 2320 }, { "epoch": 0.3027940220922677, "grad_norm": 0.269879549741745, "learning_rate": 4.697205977907733e-05, "loss": 0.2667, "step": 2330 }, { "epoch": 0.30409356725146197, "grad_norm": 0.28844526410102844, "learning_rate": 4.695906432748538e-05, "loss": 0.2479, "step": 2340 }, { "epoch": 0.30539311241065625, "grad_norm": 0.8159250020980835, "learning_rate": 4.694606887589344e-05, "loss": 0.2194, "step": 2350 }, { "epoch": 0.30669265756985054, "grad_norm": 0.3125803470611572, "learning_rate": 4.6933073424301496e-05, "loss": 0.2601, "step": 2360 }, { "epoch": 0.3079922027290448, "grad_norm": 0.555450975894928, "learning_rate": 4.692007797270955e-05, "loss": 0.2247, "step": 2370 }, { "epoch": 0.3092917478882391, "grad_norm": 0.3757578134536743, "learning_rate": 4.6907082521117615e-05, "loss": 0.2797, "step": 2380 }, { "epoch": 0.3105912930474334, "grad_norm": 0.5351017713546753, "learning_rate": 4.6894087069525664e-05, "loss": 0.285, "step": 2390 }, { "epoch": 0.31189083820662766, "grad_norm": 0.4210439622402191, "learning_rate": 4.6881091617933726e-05, "loss": 0.2004, "step": 2400 }, { "epoch": 0.31319038336582194, "grad_norm": 0.6449156999588013, "learning_rate": 4.686809616634178e-05, "loss": 0.272, "step": 2410 }, { "epoch": 0.3144899285250162, "grad_norm": 0.47756069898605347, "learning_rate": 4.685510071474984e-05, "loss": 0.2255, "step": 2420 }, { "epoch": 0.3157894736842105, "grad_norm": 0.44169968366622925, "learning_rate": 4.68421052631579e-05, "loss": 0.1884, "step": 2430 }, { "epoch": 0.3170890188434048, "grad_norm": 0.2089644968509674, "learning_rate": 4.682910981156595e-05, "loss": 0.1632, "step": 2440 }, { "epoch": 0.3183885640025991, "grad_norm": 0.603034496307373, "learning_rate": 4.681611435997401e-05, "loss": 0.2314, "step": 2450 }, { "epoch": 0.31968810916179335, "grad_norm": 0.29518869519233704, "learning_rate": 4.680311890838207e-05, "loss": 0.233, "step": 2460 }, { "epoch": 0.32098765432098764, "grad_norm": 0.37991613149642944, "learning_rate": 4.6790123456790124e-05, "loss": 0.1931, "step": 2470 }, { "epoch": 0.3222871994801819, "grad_norm": 0.5938796401023865, "learning_rate": 4.677712800519819e-05, "loss": 0.1814, "step": 2480 }, { "epoch": 0.3235867446393762, "grad_norm": 0.8949798345565796, "learning_rate": 4.6764132553606236e-05, "loss": 0.3035, "step": 2490 }, { "epoch": 0.3248862897985705, "grad_norm": 0.5152238011360168, "learning_rate": 4.67511371020143e-05, "loss": 0.2711, "step": 2500 }, { "epoch": 0.32618583495776476, "grad_norm": 0.26268821954727173, "learning_rate": 4.6738141650422354e-05, "loss": 0.2586, "step": 2510 }, { "epoch": 0.32748538011695905, "grad_norm": 0.32130828499794006, "learning_rate": 4.672514619883041e-05, "loss": 0.2004, "step": 2520 }, { "epoch": 0.32878492527615333, "grad_norm": 0.46837255358695984, "learning_rate": 4.671215074723847e-05, "loss": 0.1901, "step": 2530 }, { "epoch": 0.3300844704353476, "grad_norm": 0.3019028306007385, "learning_rate": 4.669915529564652e-05, "loss": 0.1971, "step": 2540 }, { "epoch": 0.3313840155945419, "grad_norm": 0.34287846088409424, "learning_rate": 4.6686159844054584e-05, "loss": 0.1928, "step": 2550 }, { "epoch": 0.3326835607537362, "grad_norm": 0.1920398771762848, "learning_rate": 4.667316439246264e-05, "loss": 0.2464, "step": 2560 }, { "epoch": 0.33398310591293046, "grad_norm": 0.2999134063720703, "learning_rate": 4.6660168940870696e-05, "loss": 0.2224, "step": 2570 }, { "epoch": 0.33528265107212474, "grad_norm": 0.2557377219200134, "learning_rate": 4.664717348927876e-05, "loss": 0.3075, "step": 2580 }, { "epoch": 0.336582196231319, "grad_norm": 0.48501086235046387, "learning_rate": 4.663417803768681e-05, "loss": 0.1867, "step": 2590 }, { "epoch": 0.3378817413905133, "grad_norm": 0.8913998007774353, "learning_rate": 4.662118258609487e-05, "loss": 0.294, "step": 2600 }, { "epoch": 0.3391812865497076, "grad_norm": 0.29135337471961975, "learning_rate": 4.6608187134502926e-05, "loss": 0.2287, "step": 2610 }, { "epoch": 0.34048083170890187, "grad_norm": 0.3279026746749878, "learning_rate": 4.659519168291098e-05, "loss": 0.1786, "step": 2620 }, { "epoch": 0.34178037686809615, "grad_norm": 0.22055476903915405, "learning_rate": 4.6582196231319045e-05, "loss": 0.2092, "step": 2630 }, { "epoch": 0.34307992202729043, "grad_norm": 0.18884806334972382, "learning_rate": 4.6569200779727094e-05, "loss": 0.2614, "step": 2640 }, { "epoch": 0.3443794671864847, "grad_norm": 0.5672394037246704, "learning_rate": 4.6556205328135157e-05, "loss": 0.221, "step": 2650 }, { "epoch": 0.345679012345679, "grad_norm": 0.30130288004875183, "learning_rate": 4.654320987654321e-05, "loss": 0.2808, "step": 2660 }, { "epoch": 0.3469785575048733, "grad_norm": 0.3306979537010193, "learning_rate": 4.653021442495127e-05, "loss": 0.1929, "step": 2670 }, { "epoch": 0.34827810266406756, "grad_norm": 0.6083114743232727, "learning_rate": 4.651721897335933e-05, "loss": 0.3199, "step": 2680 }, { "epoch": 0.34957764782326184, "grad_norm": 0.3411419689655304, "learning_rate": 4.650422352176738e-05, "loss": 0.2737, "step": 2690 }, { "epoch": 0.3508771929824561, "grad_norm": 0.3940567076206207, "learning_rate": 4.649122807017544e-05, "loss": 0.2762, "step": 2700 }, { "epoch": 0.3521767381416504, "grad_norm": 0.3637632131576538, "learning_rate": 4.64782326185835e-05, "loss": 0.2554, "step": 2710 }, { "epoch": 0.3534762833008447, "grad_norm": 0.6088961958885193, "learning_rate": 4.6465237166991554e-05, "loss": 0.2467, "step": 2720 }, { "epoch": 0.35477582846003897, "grad_norm": 0.20121929049491882, "learning_rate": 4.645224171539962e-05, "loss": 0.1742, "step": 2730 }, { "epoch": 0.35607537361923325, "grad_norm": 0.21762144565582275, "learning_rate": 4.6439246263807666e-05, "loss": 0.206, "step": 2740 }, { "epoch": 0.35737491877842753, "grad_norm": 0.547901451587677, "learning_rate": 4.642625081221573e-05, "loss": 0.2283, "step": 2750 }, { "epoch": 0.3586744639376218, "grad_norm": 0.5878198146820068, "learning_rate": 4.6413255360623785e-05, "loss": 0.2834, "step": 2760 }, { "epoch": 0.3599740090968161, "grad_norm": 0.20595768094062805, "learning_rate": 4.640025990903184e-05, "loss": 0.2452, "step": 2770 }, { "epoch": 0.3612735542560104, "grad_norm": 0.752597987651825, "learning_rate": 4.6387264457439896e-05, "loss": 0.2919, "step": 2780 }, { "epoch": 0.36257309941520466, "grad_norm": 0.4376462996006012, "learning_rate": 4.637426900584795e-05, "loss": 0.1705, "step": 2790 }, { "epoch": 0.36387264457439894, "grad_norm": 0.273580938577652, "learning_rate": 4.6361273554256015e-05, "loss": 0.3331, "step": 2800 }, { "epoch": 0.3651721897335932, "grad_norm": 0.22732378542423248, "learning_rate": 4.634827810266407e-05, "loss": 0.2104, "step": 2810 }, { "epoch": 0.3664717348927875, "grad_norm": 0.7806446552276611, "learning_rate": 4.6335282651072126e-05, "loss": 0.2365, "step": 2820 }, { "epoch": 0.3677712800519818, "grad_norm": 0.8364452123641968, "learning_rate": 4.632228719948018e-05, "loss": 0.2632, "step": 2830 }, { "epoch": 0.36907082521117607, "grad_norm": 0.7476444840431213, "learning_rate": 4.630929174788824e-05, "loss": 0.2371, "step": 2840 }, { "epoch": 0.37037037037037035, "grad_norm": 0.6854220032691956, "learning_rate": 4.62962962962963e-05, "loss": 0.2588, "step": 2850 }, { "epoch": 0.37166991552956463, "grad_norm": 0.40119168162345886, "learning_rate": 4.628330084470436e-05, "loss": 0.2837, "step": 2860 }, { "epoch": 0.3729694606887589, "grad_norm": 0.4324892461299896, "learning_rate": 4.627030539311241e-05, "loss": 0.2192, "step": 2870 }, { "epoch": 0.3742690058479532, "grad_norm": 0.39105966687202454, "learning_rate": 4.625730994152047e-05, "loss": 0.2467, "step": 2880 }, { "epoch": 0.3755685510071475, "grad_norm": 0.7237193584442139, "learning_rate": 4.6244314489928524e-05, "loss": 0.2017, "step": 2890 }, { "epoch": 0.37686809616634176, "grad_norm": 0.270600289106369, "learning_rate": 4.623131903833659e-05, "loss": 0.1963, "step": 2900 }, { "epoch": 0.37816764132553604, "grad_norm": 0.26706230640411377, "learning_rate": 4.621832358674464e-05, "loss": 0.1791, "step": 2910 }, { "epoch": 0.3794671864847303, "grad_norm": 0.3888351619243622, "learning_rate": 4.62053281351527e-05, "loss": 0.2289, "step": 2920 }, { "epoch": 0.3807667316439246, "grad_norm": 1.1346228122711182, "learning_rate": 4.6192332683560754e-05, "loss": 0.3698, "step": 2930 }, { "epoch": 0.3820662768031189, "grad_norm": 0.2916838228702545, "learning_rate": 4.617933723196881e-05, "loss": 0.2336, "step": 2940 }, { "epoch": 0.38336582196231317, "grad_norm": 0.5018054246902466, "learning_rate": 4.616634178037687e-05, "loss": 0.305, "step": 2950 }, { "epoch": 0.38466536712150745, "grad_norm": 0.47869592905044556, "learning_rate": 4.615334632878493e-05, "loss": 0.2318, "step": 2960 }, { "epoch": 0.38596491228070173, "grad_norm": 0.6554490923881531, "learning_rate": 4.6140350877192985e-05, "loss": 0.2284, "step": 2970 }, { "epoch": 0.387264457439896, "grad_norm": 0.3824814260005951, "learning_rate": 4.612735542560104e-05, "loss": 0.2238, "step": 2980 }, { "epoch": 0.3885640025990903, "grad_norm": 0.6530680060386658, "learning_rate": 4.6114359974009096e-05, "loss": 0.2175, "step": 2990 }, { "epoch": 0.3898635477582846, "grad_norm": 0.6266106963157654, "learning_rate": 4.610136452241716e-05, "loss": 0.6195, "step": 3000 }, { "epoch": 0.39116309291747886, "grad_norm": 1.6227953433990479, "learning_rate": 4.6088369070825215e-05, "loss": 0.2343, "step": 3010 }, { "epoch": 0.39246263807667314, "grad_norm": 0.3527167737483978, "learning_rate": 4.607537361923327e-05, "loss": 0.2799, "step": 3020 }, { "epoch": 0.3937621832358674, "grad_norm": 0.4052543640136719, "learning_rate": 4.6062378167641327e-05, "loss": 0.2542, "step": 3030 }, { "epoch": 0.3950617283950617, "grad_norm": 0.27563992142677307, "learning_rate": 4.604938271604938e-05, "loss": 0.2425, "step": 3040 }, { "epoch": 0.396361273554256, "grad_norm": 0.5449113845825195, "learning_rate": 4.6036387264457445e-05, "loss": 0.1734, "step": 3050 }, { "epoch": 0.39766081871345027, "grad_norm": 0.9767820835113525, "learning_rate": 4.60233918128655e-05, "loss": 0.2262, "step": 3060 }, { "epoch": 0.39896036387264455, "grad_norm": 0.34554794430732727, "learning_rate": 4.601039636127356e-05, "loss": 0.1758, "step": 3070 }, { "epoch": 0.40025990903183883, "grad_norm": 0.37316933274269104, "learning_rate": 4.599740090968161e-05, "loss": 0.279, "step": 3080 }, { "epoch": 0.4015594541910331, "grad_norm": 0.6593908667564392, "learning_rate": 4.598440545808967e-05, "loss": 0.1904, "step": 3090 }, { "epoch": 0.4028589993502274, "grad_norm": 0.5440442562103271, "learning_rate": 4.597141000649773e-05, "loss": 0.259, "step": 3100 }, { "epoch": 0.4041585445094217, "grad_norm": 0.8328630924224854, "learning_rate": 4.595841455490579e-05, "loss": 0.2357, "step": 3110 }, { "epoch": 0.40545808966861596, "grad_norm": 0.8768796920776367, "learning_rate": 4.594541910331384e-05, "loss": 0.2049, "step": 3120 }, { "epoch": 0.40675763482781024, "grad_norm": 0.849868893623352, "learning_rate": 4.59324236517219e-05, "loss": 0.2969, "step": 3130 }, { "epoch": 0.4080571799870045, "grad_norm": 0.5945208072662354, "learning_rate": 4.5919428200129954e-05, "loss": 0.2679, "step": 3140 }, { "epoch": 0.4093567251461988, "grad_norm": 0.7569746971130371, "learning_rate": 4.590643274853802e-05, "loss": 0.2858, "step": 3150 }, { "epoch": 0.4106562703053931, "grad_norm": 0.5134261846542358, "learning_rate": 4.589343729694607e-05, "loss": 0.1997, "step": 3160 }, { "epoch": 0.41195581546458737, "grad_norm": 0.5572426915168762, "learning_rate": 4.588044184535413e-05, "loss": 0.225, "step": 3170 }, { "epoch": 0.41325536062378165, "grad_norm": 0.3433803617954254, "learning_rate": 4.5867446393762185e-05, "loss": 0.1561, "step": 3180 }, { "epoch": 0.41455490578297594, "grad_norm": 0.5421501398086548, "learning_rate": 4.585445094217024e-05, "loss": 0.2823, "step": 3190 }, { "epoch": 0.4158544509421702, "grad_norm": 1.3089476823806763, "learning_rate": 4.58414554905783e-05, "loss": 0.301, "step": 3200 }, { "epoch": 0.4171539961013645, "grad_norm": 0.4591807425022125, "learning_rate": 4.582846003898636e-05, "loss": 0.2729, "step": 3210 }, { "epoch": 0.4184535412605588, "grad_norm": 0.6115627288818359, "learning_rate": 4.5815464587394415e-05, "loss": 0.1864, "step": 3220 }, { "epoch": 0.41975308641975306, "grad_norm": 0.4011402428150177, "learning_rate": 4.580246913580247e-05, "loss": 0.2396, "step": 3230 }, { "epoch": 0.42105263157894735, "grad_norm": 0.2887427508831024, "learning_rate": 4.5789473684210527e-05, "loss": 0.326, "step": 3240 }, { "epoch": 0.4223521767381416, "grad_norm": 0.3698146939277649, "learning_rate": 4.577647823261859e-05, "loss": 0.3012, "step": 3250 }, { "epoch": 0.4236517218973359, "grad_norm": 0.2912067472934723, "learning_rate": 4.5763482781026645e-05, "loss": 0.2443, "step": 3260 }, { "epoch": 0.4249512670565302, "grad_norm": 0.39996960759162903, "learning_rate": 4.57504873294347e-05, "loss": 0.2161, "step": 3270 }, { "epoch": 0.4262508122157245, "grad_norm": 0.4405180513858795, "learning_rate": 4.573749187784276e-05, "loss": 0.1998, "step": 3280 }, { "epoch": 0.42755035737491875, "grad_norm": 0.33486393094062805, "learning_rate": 4.572449642625081e-05, "loss": 0.2397, "step": 3290 }, { "epoch": 0.42884990253411304, "grad_norm": 0.6742551326751709, "learning_rate": 4.571150097465887e-05, "loss": 0.2091, "step": 3300 }, { "epoch": 0.4301494476933073, "grad_norm": 0.6602084636688232, "learning_rate": 4.569850552306693e-05, "loss": 0.2644, "step": 3310 }, { "epoch": 0.4314489928525016, "grad_norm": 0.3300883173942566, "learning_rate": 4.568551007147499e-05, "loss": 0.2014, "step": 3320 }, { "epoch": 0.4327485380116959, "grad_norm": 0.6733124256134033, "learning_rate": 4.567251461988304e-05, "loss": 0.2358, "step": 3330 }, { "epoch": 0.43404808317089016, "grad_norm": 0.2688356041908264, "learning_rate": 4.56595191682911e-05, "loss": 0.1973, "step": 3340 }, { "epoch": 0.43534762833008445, "grad_norm": 0.3702999949455261, "learning_rate": 4.5646523716699155e-05, "loss": 0.3211, "step": 3350 }, { "epoch": 0.43664717348927873, "grad_norm": 0.47636425495147705, "learning_rate": 4.563352826510722e-05, "loss": 0.1861, "step": 3360 }, { "epoch": 0.437946718648473, "grad_norm": 0.3395305573940277, "learning_rate": 4.562053281351527e-05, "loss": 0.2165, "step": 3370 }, { "epoch": 0.4392462638076673, "grad_norm": 0.6291455626487732, "learning_rate": 4.560753736192333e-05, "loss": 0.2989, "step": 3380 }, { "epoch": 0.4405458089668616, "grad_norm": 0.2812843918800354, "learning_rate": 4.5594541910331385e-05, "loss": 0.1917, "step": 3390 }, { "epoch": 0.44184535412605586, "grad_norm": 0.4697757661342621, "learning_rate": 4.558154645873944e-05, "loss": 0.2278, "step": 3400 }, { "epoch": 0.44314489928525014, "grad_norm": 0.253589928150177, "learning_rate": 4.55685510071475e-05, "loss": 0.2684, "step": 3410 }, { "epoch": 0.4444444444444444, "grad_norm": 0.4707001745700836, "learning_rate": 4.555555555555556e-05, "loss": 0.2302, "step": 3420 }, { "epoch": 0.4457439896036387, "grad_norm": 0.502855122089386, "learning_rate": 4.5542560103963615e-05, "loss": 0.1899, "step": 3430 }, { "epoch": 0.447043534762833, "grad_norm": 0.29601600766181946, "learning_rate": 4.552956465237167e-05, "loss": 0.1861, "step": 3440 }, { "epoch": 0.44834307992202727, "grad_norm": 0.29489901661872864, "learning_rate": 4.551656920077973e-05, "loss": 0.2067, "step": 3450 }, { "epoch": 0.44964262508122155, "grad_norm": 0.5373746752738953, "learning_rate": 4.550357374918779e-05, "loss": 0.2148, "step": 3460 }, { "epoch": 0.45094217024041583, "grad_norm": 0.7914173603057861, "learning_rate": 4.5490578297595845e-05, "loss": 0.2244, "step": 3470 }, { "epoch": 0.4522417153996101, "grad_norm": 0.537763237953186, "learning_rate": 4.54775828460039e-05, "loss": 0.2807, "step": 3480 }, { "epoch": 0.4535412605588044, "grad_norm": 0.4721389412879944, "learning_rate": 4.546458739441196e-05, "loss": 0.2701, "step": 3490 }, { "epoch": 0.4548408057179987, "grad_norm": 0.33371633291244507, "learning_rate": 4.545159194282001e-05, "loss": 0.2455, "step": 3500 }, { "epoch": 0.45614035087719296, "grad_norm": 0.517349898815155, "learning_rate": 4.5438596491228075e-05, "loss": 0.2693, "step": 3510 }, { "epoch": 0.45743989603638724, "grad_norm": 0.5011960864067078, "learning_rate": 4.542560103963613e-05, "loss": 0.2489, "step": 3520 }, { "epoch": 0.4587394411955815, "grad_norm": 0.29762643575668335, "learning_rate": 4.541260558804419e-05, "loss": 0.2374, "step": 3530 }, { "epoch": 0.4600389863547758, "grad_norm": 0.36133304238319397, "learning_rate": 4.539961013645224e-05, "loss": 0.2162, "step": 3540 }, { "epoch": 0.4613385315139701, "grad_norm": 0.4475012719631195, "learning_rate": 4.53866146848603e-05, "loss": 0.2012, "step": 3550 }, { "epoch": 0.46263807667316437, "grad_norm": 0.2771393656730652, "learning_rate": 4.537361923326836e-05, "loss": 0.2032, "step": 3560 }, { "epoch": 0.46393762183235865, "grad_norm": 0.43835747241973877, "learning_rate": 4.536062378167642e-05, "loss": 0.212, "step": 3570 }, { "epoch": 0.46523716699155293, "grad_norm": 0.7158582806587219, "learning_rate": 4.534762833008447e-05, "loss": 0.1811, "step": 3580 }, { "epoch": 0.4665367121507472, "grad_norm": 0.23875145614147186, "learning_rate": 4.533463287849253e-05, "loss": 0.2095, "step": 3590 }, { "epoch": 0.4678362573099415, "grad_norm": 0.40084120631217957, "learning_rate": 4.5321637426900585e-05, "loss": 0.2363, "step": 3600 }, { "epoch": 0.4691358024691358, "grad_norm": 0.542958676815033, "learning_rate": 4.530864197530865e-05, "loss": 0.1958, "step": 3610 }, { "epoch": 0.47043534762833006, "grad_norm": 0.7646594047546387, "learning_rate": 4.52956465237167e-05, "loss": 0.253, "step": 3620 }, { "epoch": 0.47173489278752434, "grad_norm": 0.3407529890537262, "learning_rate": 4.528265107212476e-05, "loss": 0.2781, "step": 3630 }, { "epoch": 0.4730344379467186, "grad_norm": 0.457732617855072, "learning_rate": 4.5269655620532815e-05, "loss": 0.1756, "step": 3640 }, { "epoch": 0.4743339831059129, "grad_norm": 0.6187317967414856, "learning_rate": 4.525666016894087e-05, "loss": 0.252, "step": 3650 }, { "epoch": 0.4756335282651072, "grad_norm": 0.8452842235565186, "learning_rate": 4.5243664717348934e-05, "loss": 0.2046, "step": 3660 }, { "epoch": 0.47693307342430147, "grad_norm": 0.6108794212341309, "learning_rate": 4.523066926575699e-05, "loss": 0.2115, "step": 3670 }, { "epoch": 0.47823261858349575, "grad_norm": 0.5502861142158508, "learning_rate": 4.5217673814165045e-05, "loss": 0.1908, "step": 3680 }, { "epoch": 0.47953216374269003, "grad_norm": 0.39722931385040283, "learning_rate": 4.52046783625731e-05, "loss": 0.1686, "step": 3690 }, { "epoch": 0.4808317089018843, "grad_norm": 0.4369021952152252, "learning_rate": 4.519168291098116e-05, "loss": 0.2509, "step": 3700 }, { "epoch": 0.4821312540610786, "grad_norm": 0.22782453894615173, "learning_rate": 4.517868745938922e-05, "loss": 0.2144, "step": 3710 }, { "epoch": 0.4834307992202729, "grad_norm": 0.36792558431625366, "learning_rate": 4.5165692007797275e-05, "loss": 0.2924, "step": 3720 }, { "epoch": 0.48473034437946716, "grad_norm": 0.21970559656620026, "learning_rate": 4.515269655620533e-05, "loss": 0.2114, "step": 3730 }, { "epoch": 0.48602988953866144, "grad_norm": 0.843027651309967, "learning_rate": 4.513970110461339e-05, "loss": 0.1807, "step": 3740 }, { "epoch": 0.4873294346978557, "grad_norm": 0.40730229020118713, "learning_rate": 4.512670565302144e-05, "loss": 0.2167, "step": 3750 }, { "epoch": 0.48862897985705, "grad_norm": 0.597312331199646, "learning_rate": 4.5113710201429506e-05, "loss": 0.1966, "step": 3760 }, { "epoch": 0.4899285250162443, "grad_norm": 0.42793285846710205, "learning_rate": 4.510071474983756e-05, "loss": 0.2263, "step": 3770 }, { "epoch": 0.49122807017543857, "grad_norm": 1.016913890838623, "learning_rate": 4.508771929824562e-05, "loss": 0.3853, "step": 3780 }, { "epoch": 0.49252761533463285, "grad_norm": 0.5335705280303955, "learning_rate": 4.507472384665367e-05, "loss": 0.2124, "step": 3790 }, { "epoch": 0.49382716049382713, "grad_norm": 0.5651839375495911, "learning_rate": 4.506172839506173e-05, "loss": 0.1776, "step": 3800 }, { "epoch": 0.4951267056530214, "grad_norm": 0.3866088390350342, "learning_rate": 4.504873294346979e-05, "loss": 0.2154, "step": 3810 }, { "epoch": 0.4964262508122157, "grad_norm": 0.41557425260543823, "learning_rate": 4.503573749187784e-05, "loss": 0.2332, "step": 3820 }, { "epoch": 0.49772579597141, "grad_norm": 0.8121572732925415, "learning_rate": 4.5022742040285903e-05, "loss": 0.2556, "step": 3830 }, { "epoch": 0.49902534113060426, "grad_norm": 0.8512647151947021, "learning_rate": 4.500974658869396e-05, "loss": 0.2254, "step": 3840 }, { "epoch": 0.5003248862897985, "grad_norm": 0.23071441054344177, "learning_rate": 4.4996751137102015e-05, "loss": 0.2563, "step": 3850 }, { "epoch": 0.5016244314489928, "grad_norm": 0.3257962763309479, "learning_rate": 4.498375568551008e-05, "loss": 0.2772, "step": 3860 }, { "epoch": 0.5029239766081871, "grad_norm": 0.5169506669044495, "learning_rate": 4.497076023391813e-05, "loss": 0.2455, "step": 3870 }, { "epoch": 0.5042235217673814, "grad_norm": 0.3578367829322815, "learning_rate": 4.495776478232619e-05, "loss": 0.172, "step": 3880 }, { "epoch": 0.5055230669265757, "grad_norm": 1.1798179149627686, "learning_rate": 4.4944769330734245e-05, "loss": 0.2988, "step": 3890 }, { "epoch": 0.50682261208577, "grad_norm": 0.3117285370826721, "learning_rate": 4.49317738791423e-05, "loss": 0.2424, "step": 3900 }, { "epoch": 0.5081221572449642, "grad_norm": 0.44286423921585083, "learning_rate": 4.4918778427550364e-05, "loss": 0.1742, "step": 3910 }, { "epoch": 0.5094217024041585, "grad_norm": 0.4661107659339905, "learning_rate": 4.490578297595841e-05, "loss": 0.1811, "step": 3920 }, { "epoch": 0.5107212475633528, "grad_norm": 0.23416902124881744, "learning_rate": 4.4892787524366476e-05, "loss": 0.2063, "step": 3930 }, { "epoch": 0.5120207927225471, "grad_norm": 0.5447658896446228, "learning_rate": 4.487979207277453e-05, "loss": 0.2092, "step": 3940 }, { "epoch": 0.5133203378817414, "grad_norm": 0.3650905191898346, "learning_rate": 4.486679662118259e-05, "loss": 0.3, "step": 3950 }, { "epoch": 0.5146198830409356, "grad_norm": 0.5162575840950012, "learning_rate": 4.485380116959065e-05, "loss": 0.2762, "step": 3960 }, { "epoch": 0.5159194282001299, "grad_norm": 1.136186122894287, "learning_rate": 4.48408057179987e-05, "loss": 0.2471, "step": 3970 }, { "epoch": 0.5172189733593242, "grad_norm": 0.5952631235122681, "learning_rate": 4.482781026640676e-05, "loss": 0.217, "step": 3980 }, { "epoch": 0.5185185185185185, "grad_norm": 0.4237302541732788, "learning_rate": 4.481481481481482e-05, "loss": 0.2118, "step": 3990 }, { "epoch": 0.5198180636777128, "grad_norm": 0.654021680355072, "learning_rate": 4.480181936322287e-05, "loss": 0.3091, "step": 4000 }, { "epoch": 0.521117608836907, "grad_norm": 0.8440923094749451, "learning_rate": 4.4788823911630936e-05, "loss": 0.2257, "step": 4010 }, { "epoch": 0.5224171539961013, "grad_norm": 0.2519128918647766, "learning_rate": 4.4775828460038985e-05, "loss": 0.1724, "step": 4020 }, { "epoch": 0.5237166991552956, "grad_norm": 0.5518128275871277, "learning_rate": 4.476283300844705e-05, "loss": 0.1663, "step": 4030 }, { "epoch": 0.5250162443144899, "grad_norm": 0.4135245084762573, "learning_rate": 4.4749837556855103e-05, "loss": 0.1741, "step": 4040 }, { "epoch": 0.5263157894736842, "grad_norm": 0.23150214552879333, "learning_rate": 4.473684210526316e-05, "loss": 0.241, "step": 4050 }, { "epoch": 0.5276153346328785, "grad_norm": 0.2271495908498764, "learning_rate": 4.472384665367122e-05, "loss": 0.2164, "step": 4060 }, { "epoch": 0.5289148797920727, "grad_norm": 0.40374892950057983, "learning_rate": 4.471085120207927e-05, "loss": 0.2698, "step": 4070 }, { "epoch": 0.530214424951267, "grad_norm": 0.6304338574409485, "learning_rate": 4.4697855750487334e-05, "loss": 0.257, "step": 4080 }, { "epoch": 0.5315139701104613, "grad_norm": 0.6797351241111755, "learning_rate": 4.468486029889539e-05, "loss": 0.2608, "step": 4090 }, { "epoch": 0.5328135152696556, "grad_norm": 0.7429770231246948, "learning_rate": 4.4671864847303445e-05, "loss": 0.3059, "step": 4100 }, { "epoch": 0.5341130604288499, "grad_norm": 0.7500605583190918, "learning_rate": 4.465886939571151e-05, "loss": 0.3345, "step": 4110 }, { "epoch": 0.5354126055880442, "grad_norm": 0.32393479347229004, "learning_rate": 4.464587394411956e-05, "loss": 0.1385, "step": 4120 }, { "epoch": 0.5367121507472384, "grad_norm": 0.16345323622226715, "learning_rate": 4.463287849252762e-05, "loss": 0.2191, "step": 4130 }, { "epoch": 0.5380116959064327, "grad_norm": 0.622730016708374, "learning_rate": 4.4619883040935676e-05, "loss": 0.2442, "step": 4140 }, { "epoch": 0.539311241065627, "grad_norm": 0.41071754693984985, "learning_rate": 4.460688758934373e-05, "loss": 0.2106, "step": 4150 }, { "epoch": 0.5406107862248213, "grad_norm": 0.43675366044044495, "learning_rate": 4.4593892137751794e-05, "loss": 0.2158, "step": 4160 }, { "epoch": 0.5419103313840156, "grad_norm": 0.4822518527507782, "learning_rate": 4.458089668615984e-05, "loss": 0.1827, "step": 4170 }, { "epoch": 0.5432098765432098, "grad_norm": 0.5677372813224792, "learning_rate": 4.4567901234567906e-05, "loss": 0.1724, "step": 4180 }, { "epoch": 0.5445094217024041, "grad_norm": 0.7517204880714417, "learning_rate": 4.455490578297596e-05, "loss": 0.1927, "step": 4190 }, { "epoch": 0.5458089668615984, "grad_norm": 0.8886358737945557, "learning_rate": 4.454191033138402e-05, "loss": 0.2457, "step": 4200 }, { "epoch": 0.5471085120207927, "grad_norm": 0.5863087773323059, "learning_rate": 4.452891487979208e-05, "loss": 0.2512, "step": 4210 }, { "epoch": 0.548408057179987, "grad_norm": 0.443200945854187, "learning_rate": 4.451591942820013e-05, "loss": 0.2185, "step": 4220 }, { "epoch": 0.5497076023391813, "grad_norm": 0.30141597986221313, "learning_rate": 4.450292397660819e-05, "loss": 0.2969, "step": 4230 }, { "epoch": 0.5510071474983755, "grad_norm": 0.2529583275318146, "learning_rate": 4.448992852501625e-05, "loss": 0.166, "step": 4240 }, { "epoch": 0.5523066926575698, "grad_norm": 0.7076653242111206, "learning_rate": 4.4476933073424304e-05, "loss": 0.2996, "step": 4250 }, { "epoch": 0.5536062378167641, "grad_norm": 0.320802241563797, "learning_rate": 4.4463937621832366e-05, "loss": 0.1726, "step": 4260 }, { "epoch": 0.5549057829759584, "grad_norm": 0.8083226680755615, "learning_rate": 4.4450942170240415e-05, "loss": 0.2804, "step": 4270 }, { "epoch": 0.5562053281351527, "grad_norm": 0.6102556586265564, "learning_rate": 4.443794671864848e-05, "loss": 0.3012, "step": 4280 }, { "epoch": 0.557504873294347, "grad_norm": 0.6417954564094543, "learning_rate": 4.4424951267056534e-05, "loss": 0.2458, "step": 4290 }, { "epoch": 0.5588044184535412, "grad_norm": 0.7751139998435974, "learning_rate": 4.441195581546459e-05, "loss": 0.3076, "step": 4300 }, { "epoch": 0.5601039636127355, "grad_norm": 0.5540028810501099, "learning_rate": 4.439896036387265e-05, "loss": 0.3099, "step": 4310 }, { "epoch": 0.5614035087719298, "grad_norm": 0.3704449534416199, "learning_rate": 4.43859649122807e-05, "loss": 0.2078, "step": 4320 }, { "epoch": 0.5627030539311241, "grad_norm": 0.36583223938941956, "learning_rate": 4.4372969460688764e-05, "loss": 0.2256, "step": 4330 }, { "epoch": 0.5640025990903184, "grad_norm": 0.8911951184272766, "learning_rate": 4.435997400909681e-05, "loss": 0.2101, "step": 4340 }, { "epoch": 0.5653021442495126, "grad_norm": 1.397807240486145, "learning_rate": 4.4346978557504876e-05, "loss": 0.2366, "step": 4350 }, { "epoch": 0.5666016894087069, "grad_norm": 0.39759477972984314, "learning_rate": 4.433398310591294e-05, "loss": 0.1905, "step": 4360 }, { "epoch": 0.5679012345679012, "grad_norm": 0.3699440062046051, "learning_rate": 4.432098765432099e-05, "loss": 0.2204, "step": 4370 }, { "epoch": 0.5692007797270955, "grad_norm": 0.3914189040660858, "learning_rate": 4.430799220272905e-05, "loss": 0.2439, "step": 4380 }, { "epoch": 0.5705003248862898, "grad_norm": 0.8279275298118591, "learning_rate": 4.42949967511371e-05, "loss": 0.2385, "step": 4390 }, { "epoch": 0.571799870045484, "grad_norm": 0.7355576753616333, "learning_rate": 4.428200129954516e-05, "loss": 0.2915, "step": 4400 }, { "epoch": 0.5730994152046783, "grad_norm": 0.282237708568573, "learning_rate": 4.4269005847953224e-05, "loss": 0.1631, "step": 4410 }, { "epoch": 0.5743989603638726, "grad_norm": 0.42564576864242554, "learning_rate": 4.4256010396361273e-05, "loss": 0.3263, "step": 4420 }, { "epoch": 0.5756985055230669, "grad_norm": 0.2960513234138489, "learning_rate": 4.4243014944769336e-05, "loss": 0.2153, "step": 4430 }, { "epoch": 0.5769980506822612, "grad_norm": 0.5743552446365356, "learning_rate": 4.4230019493177385e-05, "loss": 0.315, "step": 4440 }, { "epoch": 0.5782975958414555, "grad_norm": 0.3289960026741028, "learning_rate": 4.421702404158545e-05, "loss": 0.1805, "step": 4450 }, { "epoch": 0.5795971410006497, "grad_norm": 0.5462039709091187, "learning_rate": 4.4204028589993504e-05, "loss": 0.1933, "step": 4460 }, { "epoch": 0.580896686159844, "grad_norm": 0.37844765186309814, "learning_rate": 4.419103313840156e-05, "loss": 0.1981, "step": 4470 }, { "epoch": 0.5821962313190383, "grad_norm": 1.0273895263671875, "learning_rate": 4.417803768680962e-05, "loss": 0.2841, "step": 4480 }, { "epoch": 0.5834957764782326, "grad_norm": 0.6965369582176208, "learning_rate": 4.416504223521767e-05, "loss": 0.2434, "step": 4490 }, { "epoch": 0.5847953216374269, "grad_norm": 0.33003857731819153, "learning_rate": 4.4152046783625734e-05, "loss": 0.1686, "step": 4500 }, { "epoch": 0.5860948667966212, "grad_norm": 0.3900255858898163, "learning_rate": 4.413905133203379e-05, "loss": 0.2266, "step": 4510 }, { "epoch": 0.5873944119558154, "grad_norm": 0.27776652574539185, "learning_rate": 4.4126055880441846e-05, "loss": 0.2103, "step": 4520 }, { "epoch": 0.5886939571150097, "grad_norm": 0.20968852937221527, "learning_rate": 4.411306042884991e-05, "loss": 0.1869, "step": 4530 }, { "epoch": 0.589993502274204, "grad_norm": 0.7589280009269714, "learning_rate": 4.410006497725796e-05, "loss": 0.2136, "step": 4540 }, { "epoch": 0.5912930474333983, "grad_norm": 0.39138051867485046, "learning_rate": 4.408706952566602e-05, "loss": 0.1881, "step": 4550 }, { "epoch": 0.5925925925925926, "grad_norm": 0.9654112458229065, "learning_rate": 4.4074074074074076e-05, "loss": 0.3216, "step": 4560 }, { "epoch": 0.5938921377517868, "grad_norm": 0.6908769607543945, "learning_rate": 4.406107862248213e-05, "loss": 0.2395, "step": 4570 }, { "epoch": 0.5951916829109811, "grad_norm": 0.2250167429447174, "learning_rate": 4.4048083170890194e-05, "loss": 0.1283, "step": 4580 }, { "epoch": 0.5964912280701754, "grad_norm": 0.4783670902252197, "learning_rate": 4.403508771929824e-05, "loss": 0.1476, "step": 4590 }, { "epoch": 0.5977907732293697, "grad_norm": 0.7505614757537842, "learning_rate": 4.4022092267706306e-05, "loss": 0.2407, "step": 4600 }, { "epoch": 0.599090318388564, "grad_norm": 0.579213559627533, "learning_rate": 4.400909681611436e-05, "loss": 0.3207, "step": 4610 }, { "epoch": 0.6003898635477583, "grad_norm": 0.3951951265335083, "learning_rate": 4.399610136452242e-05, "loss": 0.2198, "step": 4620 }, { "epoch": 0.6016894087069525, "grad_norm": 0.2682746648788452, "learning_rate": 4.398310591293048e-05, "loss": 0.2282, "step": 4630 }, { "epoch": 0.6029889538661468, "grad_norm": 0.32232990860939026, "learning_rate": 4.397011046133853e-05, "loss": 0.133, "step": 4640 }, { "epoch": 0.6042884990253411, "grad_norm": 0.5846036076545715, "learning_rate": 4.395711500974659e-05, "loss": 0.1727, "step": 4650 }, { "epoch": 0.6055880441845354, "grad_norm": 0.4481455683708191, "learning_rate": 4.394411955815465e-05, "loss": 0.2258, "step": 4660 }, { "epoch": 0.6068875893437297, "grad_norm": 0.2681124806404114, "learning_rate": 4.3931124106562704e-05, "loss": 0.2066, "step": 4670 }, { "epoch": 0.6081871345029239, "grad_norm": 0.27545416355133057, "learning_rate": 4.3918128654970766e-05, "loss": 0.2753, "step": 4680 }, { "epoch": 0.6094866796621182, "grad_norm": 0.3981437683105469, "learning_rate": 4.3905133203378815e-05, "loss": 0.2088, "step": 4690 }, { "epoch": 0.6107862248213125, "grad_norm": 0.16542582213878632, "learning_rate": 4.389213775178688e-05, "loss": 0.3656, "step": 4700 }, { "epoch": 0.6120857699805068, "grad_norm": 0.771787703037262, "learning_rate": 4.3879142300194934e-05, "loss": 0.1602, "step": 4710 }, { "epoch": 0.6133853151397011, "grad_norm": 0.5807725191116333, "learning_rate": 4.386614684860299e-05, "loss": 0.1938, "step": 4720 }, { "epoch": 0.6146848602988954, "grad_norm": 0.37076622247695923, "learning_rate": 4.385315139701105e-05, "loss": 0.1934, "step": 4730 }, { "epoch": 0.6159844054580896, "grad_norm": 0.5894520282745361, "learning_rate": 4.38401559454191e-05, "loss": 0.1691, "step": 4740 }, { "epoch": 0.6172839506172839, "grad_norm": 0.24706336855888367, "learning_rate": 4.3827160493827164e-05, "loss": 0.2664, "step": 4750 }, { "epoch": 0.6185834957764782, "grad_norm": 0.6210567951202393, "learning_rate": 4.381416504223522e-05, "loss": 0.2336, "step": 4760 }, { "epoch": 0.6198830409356725, "grad_norm": 0.5899829864501953, "learning_rate": 4.3801169590643276e-05, "loss": 0.3062, "step": 4770 }, { "epoch": 0.6211825860948668, "grad_norm": 0.274596631526947, "learning_rate": 4.378817413905134e-05, "loss": 0.2011, "step": 4780 }, { "epoch": 0.622482131254061, "grad_norm": 0.48855844140052795, "learning_rate": 4.377517868745939e-05, "loss": 0.2677, "step": 4790 }, { "epoch": 0.6237816764132553, "grad_norm": 0.3345434367656708, "learning_rate": 4.376218323586745e-05, "loss": 0.17, "step": 4800 }, { "epoch": 0.6250812215724496, "grad_norm": 0.627191960811615, "learning_rate": 4.3749187784275506e-05, "loss": 0.2191, "step": 4810 }, { "epoch": 0.6263807667316439, "grad_norm": 0.4553559124469757, "learning_rate": 4.373619233268356e-05, "loss": 0.2067, "step": 4820 }, { "epoch": 0.6276803118908382, "grad_norm": 0.3124532997608185, "learning_rate": 4.3723196881091624e-05, "loss": 0.1629, "step": 4830 }, { "epoch": 0.6289798570500325, "grad_norm": 0.5788019299507141, "learning_rate": 4.3710201429499674e-05, "loss": 0.2113, "step": 4840 }, { "epoch": 0.6302794022092267, "grad_norm": 0.7147088050842285, "learning_rate": 4.3697205977907736e-05, "loss": 0.1545, "step": 4850 }, { "epoch": 0.631578947368421, "grad_norm": 0.894290030002594, "learning_rate": 4.368421052631579e-05, "loss": 0.203, "step": 4860 }, { "epoch": 0.6328784925276153, "grad_norm": 0.3420340418815613, "learning_rate": 4.367121507472385e-05, "loss": 0.2405, "step": 4870 }, { "epoch": 0.6341780376868096, "grad_norm": 0.19169357419013977, "learning_rate": 4.365821962313191e-05, "loss": 0.1705, "step": 4880 }, { "epoch": 0.6354775828460039, "grad_norm": 0.3293217420578003, "learning_rate": 4.364522417153996e-05, "loss": 0.1886, "step": 4890 }, { "epoch": 0.6367771280051981, "grad_norm": 0.37663987278938293, "learning_rate": 4.363222871994802e-05, "loss": 0.2131, "step": 4900 }, { "epoch": 0.6380766731643924, "grad_norm": 0.2860927879810333, "learning_rate": 4.361923326835608e-05, "loss": 0.181, "step": 4910 }, { "epoch": 0.6393762183235867, "grad_norm": 0.4331021010875702, "learning_rate": 4.3606237816764134e-05, "loss": 0.2779, "step": 4920 }, { "epoch": 0.640675763482781, "grad_norm": 0.4157128930091858, "learning_rate": 4.3593242365172197e-05, "loss": 0.2075, "step": 4930 }, { "epoch": 0.6419753086419753, "grad_norm": 0.25249797105789185, "learning_rate": 4.3580246913580246e-05, "loss": 0.2147, "step": 4940 }, { "epoch": 0.6432748538011696, "grad_norm": 0.37018197774887085, "learning_rate": 4.356725146198831e-05, "loss": 0.1483, "step": 4950 }, { "epoch": 0.6445743989603638, "grad_norm": 0.5524932742118835, "learning_rate": 4.3554256010396364e-05, "loss": 0.2412, "step": 4960 }, { "epoch": 0.6458739441195581, "grad_norm": 0.4585111737251282, "learning_rate": 4.354126055880442e-05, "loss": 0.2086, "step": 4970 }, { "epoch": 0.6471734892787524, "grad_norm": 0.3423636853694916, "learning_rate": 4.3528265107212476e-05, "loss": 0.3307, "step": 4980 }, { "epoch": 0.6484730344379467, "grad_norm": 0.4783909022808075, "learning_rate": 4.351526965562053e-05, "loss": 0.1838, "step": 4990 }, { "epoch": 0.649772579597141, "grad_norm": 0.6436142325401306, "learning_rate": 4.3502274204028594e-05, "loss": 0.1608, "step": 5000 }, { "epoch": 0.6510721247563352, "grad_norm": 0.5025466680526733, "learning_rate": 4.348927875243665e-05, "loss": 0.1701, "step": 5010 }, { "epoch": 0.6523716699155295, "grad_norm": 0.21982286870479584, "learning_rate": 4.3476283300844706e-05, "loss": 0.1943, "step": 5020 }, { "epoch": 0.6536712150747238, "grad_norm": 0.38647371530532837, "learning_rate": 4.346328784925276e-05, "loss": 0.1888, "step": 5030 }, { "epoch": 0.6549707602339181, "grad_norm": 0.5871855020523071, "learning_rate": 4.345029239766082e-05, "loss": 0.2427, "step": 5040 }, { "epoch": 0.6562703053931124, "grad_norm": 0.12435946613550186, "learning_rate": 4.343729694606888e-05, "loss": 0.1837, "step": 5050 }, { "epoch": 0.6575698505523067, "grad_norm": 0.4269551634788513, "learning_rate": 4.3424301494476936e-05, "loss": 0.2171, "step": 5060 }, { "epoch": 0.6588693957115009, "grad_norm": 0.4881094694137573, "learning_rate": 4.341130604288499e-05, "loss": 0.2127, "step": 5070 }, { "epoch": 0.6601689408706952, "grad_norm": 0.5135766863822937, "learning_rate": 4.339831059129305e-05, "loss": 0.1874, "step": 5080 }, { "epoch": 0.6614684860298895, "grad_norm": 0.4760465621948242, "learning_rate": 4.3385315139701104e-05, "loss": 0.1803, "step": 5090 }, { "epoch": 0.6627680311890838, "grad_norm": 0.24038401246070862, "learning_rate": 4.3372319688109166e-05, "loss": 0.1954, "step": 5100 }, { "epoch": 0.6640675763482781, "grad_norm": 0.7148582339286804, "learning_rate": 4.335932423651722e-05, "loss": 0.2235, "step": 5110 }, { "epoch": 0.6653671215074723, "grad_norm": 0.4967847168445587, "learning_rate": 4.334632878492528e-05, "loss": 0.1409, "step": 5120 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7088788151741028, "learning_rate": 4.3333333333333334e-05, "loss": 0.2385, "step": 5130 }, { "epoch": 0.6679662118258609, "grad_norm": 0.33680951595306396, "learning_rate": 4.332033788174139e-05, "loss": 0.201, "step": 5140 }, { "epoch": 0.6692657569850552, "grad_norm": 0.48232099413871765, "learning_rate": 4.330734243014945e-05, "loss": 0.2199, "step": 5150 }, { "epoch": 0.6705653021442495, "grad_norm": 0.35374853014945984, "learning_rate": 4.329434697855751e-05, "loss": 0.2069, "step": 5160 }, { "epoch": 0.6718648473034438, "grad_norm": 0.4539322257041931, "learning_rate": 4.3281351526965564e-05, "loss": 0.1381, "step": 5170 }, { "epoch": 0.673164392462638, "grad_norm": 0.3309534788131714, "learning_rate": 4.326835607537362e-05, "loss": 0.2018, "step": 5180 }, { "epoch": 0.6744639376218323, "grad_norm": 0.294218510389328, "learning_rate": 4.3255360623781676e-05, "loss": 0.2824, "step": 5190 }, { "epoch": 0.6757634827810266, "grad_norm": 0.15276280045509338, "learning_rate": 4.324236517218974e-05, "loss": 0.1338, "step": 5200 }, { "epoch": 0.6770630279402209, "grad_norm": 0.35392406582832336, "learning_rate": 4.3229369720597794e-05, "loss": 0.1696, "step": 5210 }, { "epoch": 0.6783625730994152, "grad_norm": 0.24682119488716125, "learning_rate": 4.321637426900585e-05, "loss": 0.2801, "step": 5220 }, { "epoch": 0.6796621182586094, "grad_norm": 0.7548143267631531, "learning_rate": 4.3203378817413906e-05, "loss": 0.1873, "step": 5230 }, { "epoch": 0.6809616634178037, "grad_norm": 0.9052791595458984, "learning_rate": 4.319038336582196e-05, "loss": 0.2203, "step": 5240 }, { "epoch": 0.682261208576998, "grad_norm": 0.4122040271759033, "learning_rate": 4.3177387914230025e-05, "loss": 0.1598, "step": 5250 }, { "epoch": 0.6835607537361923, "grad_norm": 0.42296868562698364, "learning_rate": 4.316439246263808e-05, "loss": 0.1236, "step": 5260 }, { "epoch": 0.6848602988953866, "grad_norm": 0.5848291516304016, "learning_rate": 4.3151397011046136e-05, "loss": 0.2376, "step": 5270 }, { "epoch": 0.6861598440545809, "grad_norm": 0.3748939037322998, "learning_rate": 4.313840155945419e-05, "loss": 0.232, "step": 5280 }, { "epoch": 0.6874593892137751, "grad_norm": 0.3469703495502472, "learning_rate": 4.312540610786225e-05, "loss": 0.3155, "step": 5290 }, { "epoch": 0.6887589343729694, "grad_norm": 0.6326718926429749, "learning_rate": 4.311241065627031e-05, "loss": 0.1974, "step": 5300 }, { "epoch": 0.6900584795321637, "grad_norm": 0.5056907534599304, "learning_rate": 4.3099415204678367e-05, "loss": 0.1835, "step": 5310 }, { "epoch": 0.691358024691358, "grad_norm": 0.3044179677963257, "learning_rate": 4.308641975308642e-05, "loss": 0.2663, "step": 5320 }, { "epoch": 0.6926575698505523, "grad_norm": 0.2963675856590271, "learning_rate": 4.307342430149448e-05, "loss": 0.156, "step": 5330 }, { "epoch": 0.6939571150097466, "grad_norm": 0.48213091492652893, "learning_rate": 4.3060428849902534e-05, "loss": 0.2522, "step": 5340 }, { "epoch": 0.6952566601689408, "grad_norm": 0.4289150834083557, "learning_rate": 4.30474333983106e-05, "loss": 0.2591, "step": 5350 }, { "epoch": 0.6965562053281351, "grad_norm": 0.37640947103500366, "learning_rate": 4.303443794671865e-05, "loss": 0.2258, "step": 5360 }, { "epoch": 0.6978557504873294, "grad_norm": 0.4674929976463318, "learning_rate": 4.302144249512671e-05, "loss": 0.2552, "step": 5370 }, { "epoch": 0.6991552956465237, "grad_norm": 0.18097370862960815, "learning_rate": 4.3008447043534764e-05, "loss": 0.2078, "step": 5380 }, { "epoch": 0.700454840805718, "grad_norm": 0.2306859791278839, "learning_rate": 4.299545159194282e-05, "loss": 0.1884, "step": 5390 }, { "epoch": 0.7017543859649122, "grad_norm": 0.4810875952243805, "learning_rate": 4.298245614035088e-05, "loss": 0.2885, "step": 5400 }, { "epoch": 0.7030539311241065, "grad_norm": 0.19140636920928955, "learning_rate": 4.296946068875894e-05, "loss": 0.2255, "step": 5410 }, { "epoch": 0.7043534762833008, "grad_norm": 1.1745812892913818, "learning_rate": 4.2956465237166995e-05, "loss": 0.1817, "step": 5420 }, { "epoch": 0.7056530214424951, "grad_norm": 0.9129042029380798, "learning_rate": 4.294346978557505e-05, "loss": 0.2875, "step": 5430 }, { "epoch": 0.7069525666016894, "grad_norm": 1.026507019996643, "learning_rate": 4.2930474333983106e-05, "loss": 0.2377, "step": 5440 }, { "epoch": 0.7082521117608837, "grad_norm": 0.3732326030731201, "learning_rate": 4.291747888239117e-05, "loss": 0.2389, "step": 5450 }, { "epoch": 0.7095516569200779, "grad_norm": 0.24285253882408142, "learning_rate": 4.2904483430799225e-05, "loss": 0.1934, "step": 5460 }, { "epoch": 0.7108512020792722, "grad_norm": 0.634590744972229, "learning_rate": 4.289148797920728e-05, "loss": 0.2171, "step": 5470 }, { "epoch": 0.7121507472384665, "grad_norm": 0.35189372301101685, "learning_rate": 4.2878492527615336e-05, "loss": 0.2151, "step": 5480 }, { "epoch": 0.7134502923976608, "grad_norm": 0.3360569477081299, "learning_rate": 4.286549707602339e-05, "loss": 0.209, "step": 5490 }, { "epoch": 0.7147498375568551, "grad_norm": 0.22465412318706512, "learning_rate": 4.285250162443145e-05, "loss": 0.1623, "step": 5500 }, { "epoch": 0.7160493827160493, "grad_norm": 0.6846732497215271, "learning_rate": 4.283950617283951e-05, "loss": 0.2896, "step": 5510 }, { "epoch": 0.7173489278752436, "grad_norm": 0.7119432687759399, "learning_rate": 4.282651072124757e-05, "loss": 0.2144, "step": 5520 }, { "epoch": 0.7186484730344379, "grad_norm": 1.0230082273483276, "learning_rate": 4.281351526965562e-05, "loss": 0.2062, "step": 5530 }, { "epoch": 0.7199480181936322, "grad_norm": 0.8501009941101074, "learning_rate": 4.280051981806368e-05, "loss": 0.2972, "step": 5540 }, { "epoch": 0.7212475633528265, "grad_norm": 0.5222442746162415, "learning_rate": 4.2787524366471734e-05, "loss": 0.1667, "step": 5550 }, { "epoch": 0.7225471085120208, "grad_norm": 0.353628545999527, "learning_rate": 4.27745289148798e-05, "loss": 0.205, "step": 5560 }, { "epoch": 0.723846653671215, "grad_norm": 0.40176674723625183, "learning_rate": 4.276153346328785e-05, "loss": 0.2016, "step": 5570 }, { "epoch": 0.7251461988304093, "grad_norm": 0.7965889573097229, "learning_rate": 4.274853801169591e-05, "loss": 0.2415, "step": 5580 }, { "epoch": 0.7264457439896036, "grad_norm": 0.6789657473564148, "learning_rate": 4.2735542560103964e-05, "loss": 0.2312, "step": 5590 }, { "epoch": 0.7277452891487979, "grad_norm": 0.2648601233959198, "learning_rate": 4.272254710851202e-05, "loss": 0.1939, "step": 5600 }, { "epoch": 0.7290448343079922, "grad_norm": 0.53989177942276, "learning_rate": 4.270955165692008e-05, "loss": 0.1922, "step": 5610 }, { "epoch": 0.7303443794671864, "grad_norm": 2.146649122238159, "learning_rate": 4.269655620532814e-05, "loss": 0.1714, "step": 5620 }, { "epoch": 0.7316439246263807, "grad_norm": 0.7165536880493164, "learning_rate": 4.2683560753736195e-05, "loss": 0.177, "step": 5630 }, { "epoch": 0.732943469785575, "grad_norm": 0.7613852620124817, "learning_rate": 4.267056530214425e-05, "loss": 0.312, "step": 5640 }, { "epoch": 0.7342430149447693, "grad_norm": 0.23699839413166046, "learning_rate": 4.2657569850552306e-05, "loss": 0.3034, "step": 5650 }, { "epoch": 0.7355425601039636, "grad_norm": 0.40769994258880615, "learning_rate": 4.264457439896037e-05, "loss": 0.1668, "step": 5660 }, { "epoch": 0.7368421052631579, "grad_norm": 0.7246720790863037, "learning_rate": 4.2631578947368425e-05, "loss": 0.2748, "step": 5670 }, { "epoch": 0.7381416504223521, "grad_norm": 0.41772979497909546, "learning_rate": 4.261858349577648e-05, "loss": 0.1984, "step": 5680 }, { "epoch": 0.7394411955815464, "grad_norm": 0.7313483953475952, "learning_rate": 4.2605588044184537e-05, "loss": 0.209, "step": 5690 }, { "epoch": 0.7407407407407407, "grad_norm": 0.3764808475971222, "learning_rate": 4.259259259259259e-05, "loss": 0.2591, "step": 5700 }, { "epoch": 0.742040285899935, "grad_norm": 0.4105078876018524, "learning_rate": 4.2579597141000655e-05, "loss": 0.2286, "step": 5710 }, { "epoch": 0.7433398310591293, "grad_norm": 0.38880786299705505, "learning_rate": 4.256660168940871e-05, "loss": 0.1773, "step": 5720 }, { "epoch": 0.7446393762183235, "grad_norm": 0.40069103240966797, "learning_rate": 4.255360623781677e-05, "loss": 0.1675, "step": 5730 }, { "epoch": 0.7459389213775178, "grad_norm": 0.1468103677034378, "learning_rate": 4.254061078622482e-05, "loss": 0.2229, "step": 5740 }, { "epoch": 0.7472384665367121, "grad_norm": 1.2858657836914062, "learning_rate": 4.252761533463288e-05, "loss": 0.278, "step": 5750 }, { "epoch": 0.7485380116959064, "grad_norm": 0.6086114048957825, "learning_rate": 4.251461988304094e-05, "loss": 0.3552, "step": 5760 }, { "epoch": 0.7498375568551007, "grad_norm": 0.3895244598388672, "learning_rate": 4.2501624431449e-05, "loss": 0.2241, "step": 5770 }, { "epoch": 0.751137102014295, "grad_norm": 0.6180068850517273, "learning_rate": 4.248862897985705e-05, "loss": 0.2271, "step": 5780 }, { "epoch": 0.7524366471734892, "grad_norm": 0.8005626797676086, "learning_rate": 4.247563352826511e-05, "loss": 0.2723, "step": 5790 }, { "epoch": 0.7537361923326835, "grad_norm": 0.28485116362571716, "learning_rate": 4.2462638076673164e-05, "loss": 0.1696, "step": 5800 }, { "epoch": 0.7550357374918778, "grad_norm": 0.25198036432266235, "learning_rate": 4.244964262508123e-05, "loss": 0.2573, "step": 5810 }, { "epoch": 0.7563352826510721, "grad_norm": 0.16276606917381287, "learning_rate": 4.243664717348928e-05, "loss": 0.2352, "step": 5820 }, { "epoch": 0.7576348278102664, "grad_norm": 1.0853996276855469, "learning_rate": 4.242365172189734e-05, "loss": 0.1639, "step": 5830 }, { "epoch": 0.7589343729694606, "grad_norm": 0.5278126001358032, "learning_rate": 4.2410656270305395e-05, "loss": 0.1527, "step": 5840 }, { "epoch": 0.7602339181286549, "grad_norm": 0.36478716135025024, "learning_rate": 4.239766081871345e-05, "loss": 0.2305, "step": 5850 }, { "epoch": 0.7615334632878492, "grad_norm": 0.18922503292560577, "learning_rate": 4.238466536712151e-05, "loss": 0.2212, "step": 5860 }, { "epoch": 0.7628330084470435, "grad_norm": 0.27614670991897583, "learning_rate": 4.237166991552957e-05, "loss": 0.1898, "step": 5870 }, { "epoch": 0.7641325536062378, "grad_norm": 0.25848010182380676, "learning_rate": 4.2358674463937625e-05, "loss": 0.1537, "step": 5880 }, { "epoch": 0.7654320987654321, "grad_norm": 0.3259480893611908, "learning_rate": 4.234567901234568e-05, "loss": 0.2984, "step": 5890 }, { "epoch": 0.7667316439246263, "grad_norm": 0.3170371651649475, "learning_rate": 4.2332683560753737e-05, "loss": 0.2573, "step": 5900 }, { "epoch": 0.7680311890838206, "grad_norm": 0.3089860677719116, "learning_rate": 4.23196881091618e-05, "loss": 0.2146, "step": 5910 }, { "epoch": 0.7693307342430149, "grad_norm": 1.2379772663116455, "learning_rate": 4.2306692657569855e-05, "loss": 0.2815, "step": 5920 }, { "epoch": 0.7706302794022092, "grad_norm": 0.7321758270263672, "learning_rate": 4.229369720597791e-05, "loss": 0.25, "step": 5930 }, { "epoch": 0.7719298245614035, "grad_norm": 0.5143493413925171, "learning_rate": 4.228070175438597e-05, "loss": 0.2419, "step": 5940 }, { "epoch": 0.7732293697205977, "grad_norm": 0.3954801857471466, "learning_rate": 4.226770630279402e-05, "loss": 0.1903, "step": 5950 }, { "epoch": 0.774528914879792, "grad_norm": 0.675966203212738, "learning_rate": 4.2254710851202085e-05, "loss": 0.254, "step": 5960 }, { "epoch": 0.7758284600389863, "grad_norm": 0.24206072092056274, "learning_rate": 4.224171539961014e-05, "loss": 0.172, "step": 5970 }, { "epoch": 0.7771280051981806, "grad_norm": 0.15223293006420135, "learning_rate": 4.22287199480182e-05, "loss": 0.1584, "step": 5980 }, { "epoch": 0.7784275503573749, "grad_norm": 0.364341676235199, "learning_rate": 4.221572449642625e-05, "loss": 0.1803, "step": 5990 }, { "epoch": 0.7797270955165692, "grad_norm": 0.43805640935897827, "learning_rate": 4.220272904483431e-05, "loss": 0.2095, "step": 6000 }, { "epoch": 0.7810266406757634, "grad_norm": 0.43709149956703186, "learning_rate": 4.218973359324237e-05, "loss": 0.2114, "step": 6010 }, { "epoch": 0.7823261858349577, "grad_norm": 0.767943799495697, "learning_rate": 4.217673814165042e-05, "loss": 0.1825, "step": 6020 }, { "epoch": 0.783625730994152, "grad_norm": 0.16773156821727753, "learning_rate": 4.216374269005848e-05, "loss": 0.2264, "step": 6030 }, { "epoch": 0.7849252761533463, "grad_norm": 0.20439395308494568, "learning_rate": 4.215074723846654e-05, "loss": 0.2195, "step": 6040 }, { "epoch": 0.7862248213125406, "grad_norm": 0.1612890064716339, "learning_rate": 4.2137751786874595e-05, "loss": 0.2341, "step": 6050 }, { "epoch": 0.7875243664717348, "grad_norm": 1.0032081604003906, "learning_rate": 4.212475633528266e-05, "loss": 0.1846, "step": 6060 }, { "epoch": 0.7888239116309291, "grad_norm": 0.3012808561325073, "learning_rate": 4.2111760883690706e-05, "loss": 0.1952, "step": 6070 }, { "epoch": 0.7901234567901234, "grad_norm": 0.24244408309459686, "learning_rate": 4.209876543209877e-05, "loss": 0.1849, "step": 6080 }, { "epoch": 0.7914230019493177, "grad_norm": 0.3712264895439148, "learning_rate": 4.2085769980506825e-05, "loss": 0.3039, "step": 6090 }, { "epoch": 0.792722547108512, "grad_norm": 0.27803897857666016, "learning_rate": 4.207277452891488e-05, "loss": 0.1602, "step": 6100 }, { "epoch": 0.7940220922677063, "grad_norm": 0.5271960496902466, "learning_rate": 4.2059779077322943e-05, "loss": 0.2738, "step": 6110 }, { "epoch": 0.7953216374269005, "grad_norm": 0.1811026632785797, "learning_rate": 4.204678362573099e-05, "loss": 0.1997, "step": 6120 }, { "epoch": 0.7966211825860948, "grad_norm": 0.3444100022315979, "learning_rate": 4.2033788174139055e-05, "loss": 0.2304, "step": 6130 }, { "epoch": 0.7979207277452891, "grad_norm": 0.7824494242668152, "learning_rate": 4.202079272254711e-05, "loss": 0.2094, "step": 6140 }, { "epoch": 0.7992202729044834, "grad_norm": 0.46165722608566284, "learning_rate": 4.200779727095517e-05, "loss": 0.2057, "step": 6150 }, { "epoch": 0.8005198180636777, "grad_norm": 0.22319340705871582, "learning_rate": 4.199480181936323e-05, "loss": 0.1772, "step": 6160 }, { "epoch": 0.801819363222872, "grad_norm": 0.41006097197532654, "learning_rate": 4.198180636777128e-05, "loss": 0.202, "step": 6170 }, { "epoch": 0.8031189083820662, "grad_norm": 0.5088682174682617, "learning_rate": 4.196881091617934e-05, "loss": 0.1907, "step": 6180 }, { "epoch": 0.8044184535412605, "grad_norm": 0.6273952126502991, "learning_rate": 4.19558154645874e-05, "loss": 0.1513, "step": 6190 }, { "epoch": 0.8057179987004548, "grad_norm": 0.6623740196228027, "learning_rate": 4.194282001299545e-05, "loss": 0.1892, "step": 6200 }, { "epoch": 0.8070175438596491, "grad_norm": 0.5599991083145142, "learning_rate": 4.1929824561403516e-05, "loss": 0.1412, "step": 6210 }, { "epoch": 0.8083170890188434, "grad_norm": 0.4611224830150604, "learning_rate": 4.1916829109811565e-05, "loss": 0.344, "step": 6220 }, { "epoch": 0.8096166341780376, "grad_norm": 0.5066773295402527, "learning_rate": 4.190383365821963e-05, "loss": 0.2266, "step": 6230 }, { "epoch": 0.8109161793372319, "grad_norm": 0.7974432706832886, "learning_rate": 4.189083820662768e-05, "loss": 0.2188, "step": 6240 }, { "epoch": 0.8122157244964262, "grad_norm": 0.2635398209095001, "learning_rate": 4.187784275503574e-05, "loss": 0.1845, "step": 6250 }, { "epoch": 0.8135152696556205, "grad_norm": 0.8352042436599731, "learning_rate": 4.18648473034438e-05, "loss": 0.2051, "step": 6260 }, { "epoch": 0.8148148148148148, "grad_norm": 0.3418456017971039, "learning_rate": 4.185185185185185e-05, "loss": 0.1917, "step": 6270 }, { "epoch": 0.816114359974009, "grad_norm": 0.6361962556838989, "learning_rate": 4.183885640025991e-05, "loss": 0.2572, "step": 6280 }, { "epoch": 0.8174139051332033, "grad_norm": 0.39436718821525574, "learning_rate": 4.182586094866797e-05, "loss": 0.2083, "step": 6290 }, { "epoch": 0.8187134502923976, "grad_norm": 0.6838942766189575, "learning_rate": 4.1812865497076025e-05, "loss": 0.3731, "step": 6300 }, { "epoch": 0.8200129954515919, "grad_norm": 0.3395749628543854, "learning_rate": 4.179987004548409e-05, "loss": 0.1689, "step": 6310 }, { "epoch": 0.8213125406107862, "grad_norm": 0.26529544591903687, "learning_rate": 4.178687459389214e-05, "loss": 0.0963, "step": 6320 }, { "epoch": 0.8226120857699805, "grad_norm": 0.6738656759262085, "learning_rate": 4.17738791423002e-05, "loss": 0.2343, "step": 6330 }, { "epoch": 0.8239116309291747, "grad_norm": 0.3984090983867645, "learning_rate": 4.1760883690708255e-05, "loss": 0.1724, "step": 6340 }, { "epoch": 0.825211176088369, "grad_norm": 0.41444432735443115, "learning_rate": 4.174788823911631e-05, "loss": 0.2572, "step": 6350 }, { "epoch": 0.8265107212475633, "grad_norm": 0.39478451013565063, "learning_rate": 4.1734892787524374e-05, "loss": 0.1908, "step": 6360 }, { "epoch": 0.8278102664067576, "grad_norm": 0.19381815195083618, "learning_rate": 4.172189733593242e-05, "loss": 0.2506, "step": 6370 }, { "epoch": 0.8291098115659519, "grad_norm": 0.5495195388793945, "learning_rate": 4.1708901884340485e-05, "loss": 0.2109, "step": 6380 }, { "epoch": 0.8304093567251462, "grad_norm": 0.9092202186584473, "learning_rate": 4.169590643274854e-05, "loss": 0.2253, "step": 6390 }, { "epoch": 0.8317089018843404, "grad_norm": 0.43999183177948, "learning_rate": 4.16829109811566e-05, "loss": 0.2269, "step": 6400 }, { "epoch": 0.8330084470435347, "grad_norm": 0.45947709679603577, "learning_rate": 4.166991552956466e-05, "loss": 0.3167, "step": 6410 }, { "epoch": 0.834307992202729, "grad_norm": 0.3368842303752899, "learning_rate": 4.165692007797271e-05, "loss": 0.1964, "step": 6420 }, { "epoch": 0.8356075373619233, "grad_norm": 0.4289727509021759, "learning_rate": 4.164392462638077e-05, "loss": 0.2331, "step": 6430 }, { "epoch": 0.8369070825211176, "grad_norm": 0.5656487941741943, "learning_rate": 4.163092917478883e-05, "loss": 0.3291, "step": 6440 }, { "epoch": 0.8382066276803118, "grad_norm": 0.986635684967041, "learning_rate": 4.161793372319688e-05, "loss": 0.1877, "step": 6450 }, { "epoch": 0.8395061728395061, "grad_norm": 0.9177766442298889, "learning_rate": 4.1604938271604946e-05, "loss": 0.1992, "step": 6460 }, { "epoch": 0.8408057179987004, "grad_norm": 0.5305169224739075, "learning_rate": 4.1591942820012995e-05, "loss": 0.2453, "step": 6470 }, { "epoch": 0.8421052631578947, "grad_norm": 0.4135533571243286, "learning_rate": 4.157894736842106e-05, "loss": 0.1837, "step": 6480 }, { "epoch": 0.843404808317089, "grad_norm": 0.4275001883506775, "learning_rate": 4.1565951916829113e-05, "loss": 0.1592, "step": 6490 }, { "epoch": 0.8447043534762833, "grad_norm": 0.8216626048088074, "learning_rate": 4.155295646523717e-05, "loss": 0.179, "step": 6500 }, { "epoch": 0.8460038986354775, "grad_norm": 0.47325748205184937, "learning_rate": 4.153996101364523e-05, "loss": 0.2204, "step": 6510 }, { "epoch": 0.8473034437946718, "grad_norm": 0.8100810647010803, "learning_rate": 4.152696556205328e-05, "loss": 0.1623, "step": 6520 }, { "epoch": 0.8486029889538661, "grad_norm": 0.5685278177261353, "learning_rate": 4.1513970110461344e-05, "loss": 0.2142, "step": 6530 }, { "epoch": 0.8499025341130604, "grad_norm": 0.24305205047130585, "learning_rate": 4.150097465886939e-05, "loss": 0.346, "step": 6540 }, { "epoch": 0.8512020792722547, "grad_norm": 0.5188000202178955, "learning_rate": 4.1487979207277455e-05, "loss": 0.2411, "step": 6550 }, { "epoch": 0.852501624431449, "grad_norm": 0.22426795959472656, "learning_rate": 4.147498375568552e-05, "loss": 0.1489, "step": 6560 }, { "epoch": 0.8538011695906432, "grad_norm": 0.5084758996963501, "learning_rate": 4.146198830409357e-05, "loss": 0.227, "step": 6570 }, { "epoch": 0.8551007147498375, "grad_norm": 1.6684917211532593, "learning_rate": 4.144899285250163e-05, "loss": 0.2439, "step": 6580 }, { "epoch": 0.8564002599090318, "grad_norm": 0.33544299006462097, "learning_rate": 4.143599740090968e-05, "loss": 0.2363, "step": 6590 }, { "epoch": 0.8576998050682261, "grad_norm": 0.5573083758354187, "learning_rate": 4.142300194931774e-05, "loss": 0.2947, "step": 6600 }, { "epoch": 0.8589993502274204, "grad_norm": 0.6592196822166443, "learning_rate": 4.1410006497725804e-05, "loss": 0.1849, "step": 6610 }, { "epoch": 0.8602988953866146, "grad_norm": 0.4175536334514618, "learning_rate": 4.139701104613385e-05, "loss": 0.216, "step": 6620 }, { "epoch": 0.8615984405458089, "grad_norm": 0.4865637719631195, "learning_rate": 4.1384015594541916e-05, "loss": 0.2259, "step": 6630 }, { "epoch": 0.8628979857050032, "grad_norm": 0.7458867430686951, "learning_rate": 4.1371020142949965e-05, "loss": 0.14, "step": 6640 }, { "epoch": 0.8641975308641975, "grad_norm": 0.4404042363166809, "learning_rate": 4.135802469135803e-05, "loss": 0.2495, "step": 6650 }, { "epoch": 0.8654970760233918, "grad_norm": 0.38911929726600647, "learning_rate": 4.134502923976608e-05, "loss": 0.2884, "step": 6660 }, { "epoch": 0.866796621182586, "grad_norm": 0.43760520219802856, "learning_rate": 4.133203378817414e-05, "loss": 0.1808, "step": 6670 }, { "epoch": 0.8680961663417803, "grad_norm": 0.3784533143043518, "learning_rate": 4.13190383365822e-05, "loss": 0.1838, "step": 6680 }, { "epoch": 0.8693957115009746, "grad_norm": 0.46482110023498535, "learning_rate": 4.130604288499025e-05, "loss": 0.2118, "step": 6690 }, { "epoch": 0.8706952566601689, "grad_norm": 0.4253760576248169, "learning_rate": 4.1293047433398313e-05, "loss": 0.2656, "step": 6700 }, { "epoch": 0.8719948018193632, "grad_norm": 0.49509796500205994, "learning_rate": 4.128005198180637e-05, "loss": 0.1977, "step": 6710 }, { "epoch": 0.8732943469785575, "grad_norm": 0.2593007981777191, "learning_rate": 4.1267056530214425e-05, "loss": 0.1506, "step": 6720 }, { "epoch": 0.8745938921377517, "grad_norm": 0.27944713830947876, "learning_rate": 4.125406107862249e-05, "loss": 0.2171, "step": 6730 }, { "epoch": 0.875893437296946, "grad_norm": 0.36388108134269714, "learning_rate": 4.124106562703054e-05, "loss": 0.1841, "step": 6740 }, { "epoch": 0.8771929824561403, "grad_norm": 0.25616294145584106, "learning_rate": 4.12280701754386e-05, "loss": 0.1983, "step": 6750 }, { "epoch": 0.8784925276153346, "grad_norm": 0.4525931179523468, "learning_rate": 4.1215074723846655e-05, "loss": 0.195, "step": 6760 }, { "epoch": 0.8797920727745289, "grad_norm": 0.19621138274669647, "learning_rate": 4.120207927225471e-05, "loss": 0.2065, "step": 6770 }, { "epoch": 0.8810916179337231, "grad_norm": 0.39651018381118774, "learning_rate": 4.1189083820662774e-05, "loss": 0.1731, "step": 6780 }, { "epoch": 0.8823911630929174, "grad_norm": 0.48281925916671753, "learning_rate": 4.117608836907082e-05, "loss": 0.2499, "step": 6790 }, { "epoch": 0.8836907082521117, "grad_norm": 0.9985212087631226, "learning_rate": 4.1163092917478886e-05, "loss": 0.2535, "step": 6800 }, { "epoch": 0.884990253411306, "grad_norm": 0.19346874952316284, "learning_rate": 4.115009746588694e-05, "loss": 0.159, "step": 6810 }, { "epoch": 0.8862897985705003, "grad_norm": 0.2993505299091339, "learning_rate": 4.1137102014295e-05, "loss": 0.1959, "step": 6820 }, { "epoch": 0.8875893437296946, "grad_norm": 0.5761911273002625, "learning_rate": 4.112410656270306e-05, "loss": 0.2867, "step": 6830 }, { "epoch": 0.8888888888888888, "grad_norm": 0.41034558415412903, "learning_rate": 4.111111111111111e-05, "loss": 0.2507, "step": 6840 }, { "epoch": 0.8901884340480831, "grad_norm": 0.3691641390323639, "learning_rate": 4.109811565951917e-05, "loss": 0.1624, "step": 6850 }, { "epoch": 0.8914879792072774, "grad_norm": 0.35357728600502014, "learning_rate": 4.108512020792723e-05, "loss": 0.2796, "step": 6860 }, { "epoch": 0.8927875243664717, "grad_norm": 0.497406929731369, "learning_rate": 4.107212475633528e-05, "loss": 0.1718, "step": 6870 }, { "epoch": 0.894087069525666, "grad_norm": 0.48522260785102844, "learning_rate": 4.1059129304743346e-05, "loss": 0.2669, "step": 6880 }, { "epoch": 0.8953866146848602, "grad_norm": 0.19809779524803162, "learning_rate": 4.1046133853151395e-05, "loss": 0.2025, "step": 6890 }, { "epoch": 0.8966861598440545, "grad_norm": 0.4282327890396118, "learning_rate": 4.103313840155946e-05, "loss": 0.1381, "step": 6900 }, { "epoch": 0.8979857050032488, "grad_norm": 0.35746249556541443, "learning_rate": 4.1020142949967514e-05, "loss": 0.2544, "step": 6910 }, { "epoch": 0.8992852501624431, "grad_norm": 0.25352248549461365, "learning_rate": 4.100714749837557e-05, "loss": 0.1563, "step": 6920 }, { "epoch": 0.9005847953216374, "grad_norm": 0.3709542155265808, "learning_rate": 4.099415204678363e-05, "loss": 0.1828, "step": 6930 }, { "epoch": 0.9018843404808317, "grad_norm": 1.118669033050537, "learning_rate": 4.098115659519168e-05, "loss": 0.2211, "step": 6940 }, { "epoch": 0.9031838856400259, "grad_norm": 0.7091696858406067, "learning_rate": 4.0968161143599744e-05, "loss": 0.2863, "step": 6950 }, { "epoch": 0.9044834307992202, "grad_norm": 0.5818303227424622, "learning_rate": 4.09551656920078e-05, "loss": 0.2292, "step": 6960 }, { "epoch": 0.9057829759584145, "grad_norm": 0.9587668776512146, "learning_rate": 4.0942170240415855e-05, "loss": 0.2214, "step": 6970 }, { "epoch": 0.9070825211176088, "grad_norm": 0.1284518986940384, "learning_rate": 4.092917478882392e-05, "loss": 0.1651, "step": 6980 }, { "epoch": 0.9083820662768031, "grad_norm": 0.3779367506504059, "learning_rate": 4.091617933723197e-05, "loss": 0.1744, "step": 6990 }, { "epoch": 0.9096816114359974, "grad_norm": 0.2542187571525574, "learning_rate": 4.090318388564003e-05, "loss": 0.1826, "step": 7000 }, { "epoch": 0.9109811565951916, "grad_norm": 0.25603288412094116, "learning_rate": 4.0890188434048086e-05, "loss": 0.1936, "step": 7010 }, { "epoch": 0.9122807017543859, "grad_norm": 0.27674391865730286, "learning_rate": 4.087719298245614e-05, "loss": 0.2362, "step": 7020 }, { "epoch": 0.9135802469135802, "grad_norm": 0.3878907561302185, "learning_rate": 4.0864197530864204e-05, "loss": 0.1752, "step": 7030 }, { "epoch": 0.9148797920727745, "grad_norm": 0.4440919756889343, "learning_rate": 4.085120207927225e-05, "loss": 0.2614, "step": 7040 }, { "epoch": 0.9161793372319688, "grad_norm": 0.18594194948673248, "learning_rate": 4.0838206627680316e-05, "loss": 0.2562, "step": 7050 }, { "epoch": 0.917478882391163, "grad_norm": 0.2770173251628876, "learning_rate": 4.082521117608837e-05, "loss": 0.1537, "step": 7060 }, { "epoch": 0.9187784275503573, "grad_norm": 0.3920951783657074, "learning_rate": 4.081221572449643e-05, "loss": 0.2313, "step": 7070 }, { "epoch": 0.9200779727095516, "grad_norm": 0.44636234641075134, "learning_rate": 4.079922027290449e-05, "loss": 0.1679, "step": 7080 }, { "epoch": 0.9213775178687459, "grad_norm": 0.6915001273155212, "learning_rate": 4.078622482131254e-05, "loss": 0.2777, "step": 7090 }, { "epoch": 0.9226770630279402, "grad_norm": 0.41541507840156555, "learning_rate": 4.07732293697206e-05, "loss": 0.1899, "step": 7100 }, { "epoch": 0.9239766081871345, "grad_norm": 0.4200087785720825, "learning_rate": 4.076023391812866e-05, "loss": 0.219, "step": 7110 }, { "epoch": 0.9252761533463287, "grad_norm": 0.7210721969604492, "learning_rate": 4.0747238466536714e-05, "loss": 0.2038, "step": 7120 }, { "epoch": 0.926575698505523, "grad_norm": 0.3006529211997986, "learning_rate": 4.0734243014944776e-05, "loss": 0.1412, "step": 7130 }, { "epoch": 0.9278752436647173, "grad_norm": 0.3770562410354614, "learning_rate": 4.0721247563352825e-05, "loss": 0.2538, "step": 7140 }, { "epoch": 0.9291747888239116, "grad_norm": 0.9460674524307251, "learning_rate": 4.070825211176089e-05, "loss": 0.171, "step": 7150 }, { "epoch": 0.9304743339831059, "grad_norm": 0.3650246560573578, "learning_rate": 4.0695256660168944e-05, "loss": 0.2509, "step": 7160 }, { "epoch": 0.9317738791423001, "grad_norm": 0.1892554610967636, "learning_rate": 4.0682261208577e-05, "loss": 0.2628, "step": 7170 }, { "epoch": 0.9330734243014944, "grad_norm": 0.7520748376846313, "learning_rate": 4.0669265756985056e-05, "loss": 0.1691, "step": 7180 }, { "epoch": 0.9343729694606887, "grad_norm": 0.38146114349365234, "learning_rate": 4.065627030539311e-05, "loss": 0.162, "step": 7190 }, { "epoch": 0.935672514619883, "grad_norm": 0.19431529939174652, "learning_rate": 4.0643274853801174e-05, "loss": 0.224, "step": 7200 }, { "epoch": 0.9369720597790773, "grad_norm": 0.31227338314056396, "learning_rate": 4.063027940220923e-05, "loss": 0.1495, "step": 7210 }, { "epoch": 0.9382716049382716, "grad_norm": 0.5447232127189636, "learning_rate": 4.0617283950617286e-05, "loss": 0.2343, "step": 7220 }, { "epoch": 0.9395711500974658, "grad_norm": 0.37811630964279175, "learning_rate": 4.060428849902534e-05, "loss": 0.172, "step": 7230 }, { "epoch": 0.9408706952566601, "grad_norm": 0.3671533167362213, "learning_rate": 4.05912930474334e-05, "loss": 0.2233, "step": 7240 }, { "epoch": 0.9421702404158544, "grad_norm": 0.5382195115089417, "learning_rate": 4.057829759584146e-05, "loss": 0.1843, "step": 7250 }, { "epoch": 0.9434697855750487, "grad_norm": 0.5836758613586426, "learning_rate": 4.0565302144249516e-05, "loss": 0.1927, "step": 7260 }, { "epoch": 0.944769330734243, "grad_norm": 0.6249738335609436, "learning_rate": 4.055230669265757e-05, "loss": 0.1926, "step": 7270 }, { "epoch": 0.9460688758934372, "grad_norm": 0.5033667087554932, "learning_rate": 4.053931124106563e-05, "loss": 0.1567, "step": 7280 }, { "epoch": 0.9473684210526315, "grad_norm": 0.4246724545955658, "learning_rate": 4.0526315789473684e-05, "loss": 0.2045, "step": 7290 }, { "epoch": 0.9486679662118258, "grad_norm": 0.40300866961479187, "learning_rate": 4.0513320337881746e-05, "loss": 0.1672, "step": 7300 }, { "epoch": 0.9499675113710201, "grad_norm": 0.5183202624320984, "learning_rate": 4.05003248862898e-05, "loss": 0.1799, "step": 7310 }, { "epoch": 0.9512670565302144, "grad_norm": 0.6221484541893005, "learning_rate": 4.048732943469786e-05, "loss": 0.1982, "step": 7320 }, { "epoch": 0.9525666016894087, "grad_norm": 0.5879034996032715, "learning_rate": 4.0474333983105914e-05, "loss": 0.177, "step": 7330 }, { "epoch": 0.9538661468486029, "grad_norm": 1.0734976530075073, "learning_rate": 4.046133853151397e-05, "loss": 0.1913, "step": 7340 }, { "epoch": 0.9551656920077972, "grad_norm": 0.2798386514186859, "learning_rate": 4.044834307992203e-05, "loss": 0.2161, "step": 7350 }, { "epoch": 0.9564652371669915, "grad_norm": 0.1793740838766098, "learning_rate": 4.043534762833009e-05, "loss": 0.1828, "step": 7360 }, { "epoch": 0.9577647823261858, "grad_norm": 0.6812098622322083, "learning_rate": 4.0422352176738144e-05, "loss": 0.2377, "step": 7370 }, { "epoch": 0.9590643274853801, "grad_norm": 0.7494365572929382, "learning_rate": 4.04093567251462e-05, "loss": 0.2243, "step": 7380 }, { "epoch": 0.9603638726445743, "grad_norm": 0.520855188369751, "learning_rate": 4.0396361273554256e-05, "loss": 0.1944, "step": 7390 }, { "epoch": 0.9616634178037686, "grad_norm": 0.6524550318717957, "learning_rate": 4.038336582196232e-05, "loss": 0.2243, "step": 7400 }, { "epoch": 0.9629629629629629, "grad_norm": 0.4939538836479187, "learning_rate": 4.0370370370370374e-05, "loss": 0.1496, "step": 7410 }, { "epoch": 0.9642625081221572, "grad_norm": 1.0866429805755615, "learning_rate": 4.035737491877843e-05, "loss": 0.2461, "step": 7420 }, { "epoch": 0.9655620532813515, "grad_norm": 0.49759697914123535, "learning_rate": 4.0344379467186486e-05, "loss": 0.2156, "step": 7430 }, { "epoch": 0.9668615984405458, "grad_norm": 0.44099515676498413, "learning_rate": 4.033138401559454e-05, "loss": 0.1481, "step": 7440 }, { "epoch": 0.96816114359974, "grad_norm": 0.6560086011886597, "learning_rate": 4.0318388564002604e-05, "loss": 0.2205, "step": 7450 }, { "epoch": 0.9694606887589343, "grad_norm": 0.1928422600030899, "learning_rate": 4.030539311241066e-05, "loss": 0.2396, "step": 7460 }, { "epoch": 0.9707602339181286, "grad_norm": 0.3462390899658203, "learning_rate": 4.0292397660818716e-05, "loss": 0.1555, "step": 7470 }, { "epoch": 0.9720597790773229, "grad_norm": 0.5904410481452942, "learning_rate": 4.027940220922677e-05, "loss": 0.2449, "step": 7480 }, { "epoch": 0.9733593242365172, "grad_norm": 0.5947417616844177, "learning_rate": 4.026640675763483e-05, "loss": 0.2463, "step": 7490 }, { "epoch": 0.9746588693957114, "grad_norm": 0.5635711550712585, "learning_rate": 4.025341130604289e-05, "loss": 0.1968, "step": 7500 }, { "epoch": 0.9759584145549057, "grad_norm": 0.2512187361717224, "learning_rate": 4.0240415854450946e-05, "loss": 0.2023, "step": 7510 }, { "epoch": 0.9772579597141, "grad_norm": 0.5954657196998596, "learning_rate": 4.0227420402859e-05, "loss": 0.1824, "step": 7520 }, { "epoch": 0.9785575048732943, "grad_norm": 0.6840699911117554, "learning_rate": 4.021442495126706e-05, "loss": 0.2288, "step": 7530 }, { "epoch": 0.9798570500324886, "grad_norm": 0.7672845125198364, "learning_rate": 4.0201429499675114e-05, "loss": 0.2052, "step": 7540 }, { "epoch": 0.9811565951916829, "grad_norm": 0.4324817955493927, "learning_rate": 4.0188434048083176e-05, "loss": 0.1647, "step": 7550 }, { "epoch": 0.9824561403508771, "grad_norm": 0.6581873297691345, "learning_rate": 4.017543859649123e-05, "loss": 0.2746, "step": 7560 }, { "epoch": 0.9837556855100714, "grad_norm": 0.1580251157283783, "learning_rate": 4.016244314489929e-05, "loss": 0.2578, "step": 7570 }, { "epoch": 0.9850552306692657, "grad_norm": 0.5325599908828735, "learning_rate": 4.0149447693307344e-05, "loss": 0.2858, "step": 7580 }, { "epoch": 0.98635477582846, "grad_norm": 0.2965157628059387, "learning_rate": 4.01364522417154e-05, "loss": 0.2535, "step": 7590 }, { "epoch": 0.9876543209876543, "grad_norm": 0.7512301206588745, "learning_rate": 4.012345679012346e-05, "loss": 0.3065, "step": 7600 }, { "epoch": 0.9889538661468485, "grad_norm": 1.2121434211730957, "learning_rate": 4.011046133853152e-05, "loss": 0.2675, "step": 7610 }, { "epoch": 0.9902534113060428, "grad_norm": 0.5856961607933044, "learning_rate": 4.0097465886939574e-05, "loss": 0.1323, "step": 7620 }, { "epoch": 0.9915529564652371, "grad_norm": 0.18814370036125183, "learning_rate": 4.008447043534763e-05, "loss": 0.1808, "step": 7630 }, { "epoch": 0.9928525016244314, "grad_norm": 0.1658950001001358, "learning_rate": 4.0071474983755686e-05, "loss": 0.2363, "step": 7640 }, { "epoch": 0.9941520467836257, "grad_norm": 0.43226999044418335, "learning_rate": 4.005847953216375e-05, "loss": 0.1792, "step": 7650 }, { "epoch": 0.99545159194282, "grad_norm": 0.19470569491386414, "learning_rate": 4.0045484080571804e-05, "loss": 0.2321, "step": 7660 }, { "epoch": 0.9967511371020142, "grad_norm": 0.28112614154815674, "learning_rate": 4.003248862897986e-05, "loss": 0.1999, "step": 7670 }, { "epoch": 0.9980506822612085, "grad_norm": 0.6221048831939697, "learning_rate": 4.0019493177387916e-05, "loss": 0.2602, "step": 7680 }, { "epoch": 0.9993502274204028, "grad_norm": 0.36631008982658386, "learning_rate": 4.000649772579597e-05, "loss": 0.1974, "step": 7690 }, { "epoch": 1.0, "eval_loss": 0.1614588350057602, "eval_runtime": 854.901, "eval_samples_per_second": 9.001, "eval_steps_per_second": 9.001, "step": 7695 }, { "epoch": 1.000649772579597, "grad_norm": 0.2758638858795166, "learning_rate": 3.999350227420403e-05, "loss": 0.1758, "step": 7700 }, { "epoch": 1.0019493177387915, "grad_norm": 0.1429820954799652, "learning_rate": 3.9980506822612084e-05, "loss": 0.1488, "step": 7710 }, { "epoch": 1.0032488628979856, "grad_norm": 0.836826503276825, "learning_rate": 3.9967511371020146e-05, "loss": 0.192, "step": 7720 }, { "epoch": 1.00454840805718, "grad_norm": 0.42049139738082886, "learning_rate": 3.99545159194282e-05, "loss": 0.2083, "step": 7730 }, { "epoch": 1.0058479532163742, "grad_norm": 0.16713564097881317, "learning_rate": 3.994152046783626e-05, "loss": 0.1284, "step": 7740 }, { "epoch": 1.0071474983755686, "grad_norm": 0.9132192134857178, "learning_rate": 3.9928525016244314e-05, "loss": 0.2668, "step": 7750 }, { "epoch": 1.0084470435347628, "grad_norm": 0.6017425060272217, "learning_rate": 3.991552956465237e-05, "loss": 0.1932, "step": 7760 }, { "epoch": 1.0097465886939572, "grad_norm": 3.0272817611694336, "learning_rate": 3.990253411306043e-05, "loss": 0.1636, "step": 7770 }, { "epoch": 1.0110461338531513, "grad_norm": 0.25496551394462585, "learning_rate": 3.988953866146849e-05, "loss": 0.1696, "step": 7780 }, { "epoch": 1.0123456790123457, "grad_norm": 0.4101819694042206, "learning_rate": 3.9876543209876544e-05, "loss": 0.1807, "step": 7790 }, { "epoch": 1.01364522417154, "grad_norm": 0.7568045854568481, "learning_rate": 3.98635477582846e-05, "loss": 0.1471, "step": 7800 }, { "epoch": 1.0149447693307343, "grad_norm": 0.4302207827568054, "learning_rate": 3.9850552306692656e-05, "loss": 0.1806, "step": 7810 }, { "epoch": 1.0162443144899285, "grad_norm": 0.24327093362808228, "learning_rate": 3.983755685510072e-05, "loss": 0.1521, "step": 7820 }, { "epoch": 1.0175438596491229, "grad_norm": 0.40655753016471863, "learning_rate": 3.9824561403508774e-05, "loss": 0.2334, "step": 7830 }, { "epoch": 1.018843404808317, "grad_norm": 0.9085299372673035, "learning_rate": 3.981156595191683e-05, "loss": 0.1954, "step": 7840 }, { "epoch": 1.0201429499675114, "grad_norm": 0.6893976330757141, "learning_rate": 3.9798570500324886e-05, "loss": 0.2025, "step": 7850 }, { "epoch": 1.0214424951267056, "grad_norm": 0.5733660459518433, "learning_rate": 3.978557504873294e-05, "loss": 0.1694, "step": 7860 }, { "epoch": 1.0227420402859, "grad_norm": 0.7230072617530823, "learning_rate": 3.9772579597141004e-05, "loss": 0.1628, "step": 7870 }, { "epoch": 1.0240415854450942, "grad_norm": 0.19591017067432404, "learning_rate": 3.975958414554906e-05, "loss": 0.1359, "step": 7880 }, { "epoch": 1.0253411306042886, "grad_norm": 0.24206438660621643, "learning_rate": 3.9746588693957116e-05, "loss": 0.1633, "step": 7890 }, { "epoch": 1.0266406757634827, "grad_norm": 0.45750102400779724, "learning_rate": 3.973359324236517e-05, "loss": 0.2035, "step": 7900 }, { "epoch": 1.0279402209226771, "grad_norm": 0.26918065547943115, "learning_rate": 3.972059779077323e-05, "loss": 0.1103, "step": 7910 }, { "epoch": 1.0292397660818713, "grad_norm": 0.5304617285728455, "learning_rate": 3.970760233918129e-05, "loss": 0.1888, "step": 7920 }, { "epoch": 1.0305393112410657, "grad_norm": 0.39123448729515076, "learning_rate": 3.9694606887589346e-05, "loss": 0.1773, "step": 7930 }, { "epoch": 1.0318388564002599, "grad_norm": 0.45290616154670715, "learning_rate": 3.96816114359974e-05, "loss": 0.1523, "step": 7940 }, { "epoch": 1.0331384015594542, "grad_norm": 0.43001556396484375, "learning_rate": 3.966861598440546e-05, "loss": 0.1521, "step": 7950 }, { "epoch": 1.0344379467186484, "grad_norm": 0.7810167670249939, "learning_rate": 3.9655620532813514e-05, "loss": 0.2524, "step": 7960 }, { "epoch": 1.0357374918778428, "grad_norm": 0.501545786857605, "learning_rate": 3.9642625081221577e-05, "loss": 0.1585, "step": 7970 }, { "epoch": 1.037037037037037, "grad_norm": 0.39627644419670105, "learning_rate": 3.962962962962963e-05, "loss": 0.1652, "step": 7980 }, { "epoch": 1.0383365821962314, "grad_norm": 0.33287590742111206, "learning_rate": 3.961663417803769e-05, "loss": 0.1918, "step": 7990 }, { "epoch": 1.0396361273554255, "grad_norm": 0.4792992174625397, "learning_rate": 3.9603638726445744e-05, "loss": 0.0958, "step": 8000 }, { "epoch": 1.04093567251462, "grad_norm": 0.5145460367202759, "learning_rate": 3.95906432748538e-05, "loss": 0.1766, "step": 8010 }, { "epoch": 1.042235217673814, "grad_norm": 0.6790533661842346, "learning_rate": 3.957764782326186e-05, "loss": 0.1722, "step": 8020 }, { "epoch": 1.0435347628330085, "grad_norm": 0.2579961121082306, "learning_rate": 3.956465237166992e-05, "loss": 0.2394, "step": 8030 }, { "epoch": 1.0448343079922027, "grad_norm": 0.3909823000431061, "learning_rate": 3.9551656920077974e-05, "loss": 0.2079, "step": 8040 }, { "epoch": 1.046133853151397, "grad_norm": 0.25735795497894287, "learning_rate": 3.953866146848603e-05, "loss": 0.115, "step": 8050 }, { "epoch": 1.0474333983105912, "grad_norm": 0.3321128189563751, "learning_rate": 3.9525666016894086e-05, "loss": 0.1916, "step": 8060 }, { "epoch": 1.0487329434697856, "grad_norm": 0.2801649272441864, "learning_rate": 3.951267056530215e-05, "loss": 0.1916, "step": 8070 }, { "epoch": 1.0500324886289798, "grad_norm": 0.5579587817192078, "learning_rate": 3.9499675113710205e-05, "loss": 0.1963, "step": 8080 }, { "epoch": 1.0513320337881742, "grad_norm": 0.3526497185230255, "learning_rate": 3.948667966211826e-05, "loss": 0.1385, "step": 8090 }, { "epoch": 1.0526315789473684, "grad_norm": 0.4870572090148926, "learning_rate": 3.9473684210526316e-05, "loss": 0.1896, "step": 8100 }, { "epoch": 1.0539311241065628, "grad_norm": 0.30612537264823914, "learning_rate": 3.946068875893437e-05, "loss": 0.1903, "step": 8110 }, { "epoch": 1.055230669265757, "grad_norm": 0.807648241519928, "learning_rate": 3.9447693307342435e-05, "loss": 0.1706, "step": 8120 }, { "epoch": 1.0565302144249513, "grad_norm": 0.6151295900344849, "learning_rate": 3.943469785575049e-05, "loss": 0.1978, "step": 8130 }, { "epoch": 1.0578297595841455, "grad_norm": 0.2602250277996063, "learning_rate": 3.9421702404158546e-05, "loss": 0.1076, "step": 8140 }, { "epoch": 1.0591293047433399, "grad_norm": 0.27905598282814026, "learning_rate": 3.94087069525666e-05, "loss": 0.1589, "step": 8150 }, { "epoch": 1.060428849902534, "grad_norm": 0.48851117491722107, "learning_rate": 3.939571150097466e-05, "loss": 0.1489, "step": 8160 }, { "epoch": 1.0617283950617284, "grad_norm": 0.6958509087562561, "learning_rate": 3.938271604938272e-05, "loss": 0.1855, "step": 8170 }, { "epoch": 1.0630279402209226, "grad_norm": 0.5828195810317993, "learning_rate": 3.936972059779078e-05, "loss": 0.1671, "step": 8180 }, { "epoch": 1.064327485380117, "grad_norm": 0.7780759334564209, "learning_rate": 3.935672514619883e-05, "loss": 0.2322, "step": 8190 }, { "epoch": 1.0656270305393112, "grad_norm": 0.3231164813041687, "learning_rate": 3.934372969460689e-05, "loss": 0.1366, "step": 8200 }, { "epoch": 1.0669265756985056, "grad_norm": 0.6022560000419617, "learning_rate": 3.9330734243014944e-05, "loss": 0.201, "step": 8210 }, { "epoch": 1.0682261208576997, "grad_norm": 1.338723063468933, "learning_rate": 3.9317738791423e-05, "loss": 0.2907, "step": 8220 }, { "epoch": 1.0695256660168941, "grad_norm": 0.6001616716384888, "learning_rate": 3.930474333983106e-05, "loss": 0.2217, "step": 8230 }, { "epoch": 1.0708252111760883, "grad_norm": 0.36626872420310974, "learning_rate": 3.929174788823912e-05, "loss": 0.1538, "step": 8240 }, { "epoch": 1.0721247563352827, "grad_norm": 0.30771109461784363, "learning_rate": 3.9278752436647174e-05, "loss": 0.1513, "step": 8250 }, { "epoch": 1.0734243014944769, "grad_norm": 0.46677008271217346, "learning_rate": 3.926575698505523e-05, "loss": 0.1686, "step": 8260 }, { "epoch": 1.0747238466536713, "grad_norm": 0.690041720867157, "learning_rate": 3.9252761533463286e-05, "loss": 0.1628, "step": 8270 }, { "epoch": 1.0760233918128654, "grad_norm": 0.4476456940174103, "learning_rate": 3.923976608187135e-05, "loss": 0.1945, "step": 8280 }, { "epoch": 1.0773229369720598, "grad_norm": 0.17868691682815552, "learning_rate": 3.9226770630279405e-05, "loss": 0.161, "step": 8290 }, { "epoch": 1.078622482131254, "grad_norm": 0.48593881726264954, "learning_rate": 3.921377517868746e-05, "loss": 0.1769, "step": 8300 }, { "epoch": 1.0799220272904484, "grad_norm": 0.2049403190612793, "learning_rate": 3.9200779727095516e-05, "loss": 0.1425, "step": 8310 }, { "epoch": 1.0812215724496426, "grad_norm": 0.4582839608192444, "learning_rate": 3.918778427550357e-05, "loss": 0.2127, "step": 8320 }, { "epoch": 1.082521117608837, "grad_norm": 0.7715213298797607, "learning_rate": 3.9174788823911635e-05, "loss": 0.1583, "step": 8330 }, { "epoch": 1.0838206627680311, "grad_norm": 0.45168742537498474, "learning_rate": 3.916179337231969e-05, "loss": 0.2669, "step": 8340 }, { "epoch": 1.0851202079272255, "grad_norm": 0.41766372323036194, "learning_rate": 3.9148797920727747e-05, "loss": 0.1584, "step": 8350 }, { "epoch": 1.0864197530864197, "grad_norm": 0.30281469225883484, "learning_rate": 3.91358024691358e-05, "loss": 0.1672, "step": 8360 }, { "epoch": 1.087719298245614, "grad_norm": 0.9641382098197937, "learning_rate": 3.912280701754386e-05, "loss": 0.2074, "step": 8370 }, { "epoch": 1.0890188434048083, "grad_norm": 0.23389996588230133, "learning_rate": 3.910981156595192e-05, "loss": 0.2199, "step": 8380 }, { "epoch": 1.0903183885640026, "grad_norm": 0.364706426858902, "learning_rate": 3.909681611435998e-05, "loss": 0.1902, "step": 8390 }, { "epoch": 1.0916179337231968, "grad_norm": 0.38523611426353455, "learning_rate": 3.908382066276803e-05, "loss": 0.1633, "step": 8400 }, { "epoch": 1.0929174788823912, "grad_norm": 0.2055005580186844, "learning_rate": 3.907082521117609e-05, "loss": 0.1258, "step": 8410 }, { "epoch": 1.0942170240415854, "grad_norm": 0.5550917983055115, "learning_rate": 3.9057829759584144e-05, "loss": 0.1343, "step": 8420 }, { "epoch": 1.0955165692007798, "grad_norm": 0.23851877450942993, "learning_rate": 3.904483430799221e-05, "loss": 0.1492, "step": 8430 }, { "epoch": 1.096816114359974, "grad_norm": 0.5098744034767151, "learning_rate": 3.903183885640026e-05, "loss": 0.1894, "step": 8440 }, { "epoch": 1.0981156595191683, "grad_norm": 1.1725133657455444, "learning_rate": 3.901884340480832e-05, "loss": 0.2155, "step": 8450 }, { "epoch": 1.0994152046783625, "grad_norm": 0.8481301069259644, "learning_rate": 3.9005847953216374e-05, "loss": 0.159, "step": 8460 }, { "epoch": 1.100714749837557, "grad_norm": 0.3184190094470978, "learning_rate": 3.899285250162443e-05, "loss": 0.2112, "step": 8470 }, { "epoch": 1.102014294996751, "grad_norm": 0.7829905152320862, "learning_rate": 3.897985705003249e-05, "loss": 0.1895, "step": 8480 }, { "epoch": 1.1033138401559455, "grad_norm": 0.4919956922531128, "learning_rate": 3.896686159844055e-05, "loss": 0.2378, "step": 8490 }, { "epoch": 1.1046133853151396, "grad_norm": 0.2313760221004486, "learning_rate": 3.8953866146848605e-05, "loss": 0.1734, "step": 8500 }, { "epoch": 1.105912930474334, "grad_norm": 0.20121128857135773, "learning_rate": 3.894087069525666e-05, "loss": 0.1343, "step": 8510 }, { "epoch": 1.1072124756335282, "grad_norm": 0.3259910047054291, "learning_rate": 3.8927875243664716e-05, "loss": 0.16, "step": 8520 }, { "epoch": 1.1085120207927226, "grad_norm": 1.0704034566879272, "learning_rate": 3.891487979207278e-05, "loss": 0.1702, "step": 8530 }, { "epoch": 1.1098115659519168, "grad_norm": 0.2182963788509369, "learning_rate": 3.8901884340480835e-05, "loss": 0.1263, "step": 8540 }, { "epoch": 1.1111111111111112, "grad_norm": 0.3261566460132599, "learning_rate": 3.888888888888889e-05, "loss": 0.2432, "step": 8550 }, { "epoch": 1.1124106562703053, "grad_norm": 0.33483636379241943, "learning_rate": 3.8875893437296947e-05, "loss": 0.1707, "step": 8560 }, { "epoch": 1.1137102014294997, "grad_norm": 0.7208685874938965, "learning_rate": 3.8862897985705e-05, "loss": 0.1752, "step": 8570 }, { "epoch": 1.115009746588694, "grad_norm": 0.7477616667747498, "learning_rate": 3.8849902534113065e-05, "loss": 0.2223, "step": 8580 }, { "epoch": 1.1163092917478883, "grad_norm": 0.45955953001976013, "learning_rate": 3.883690708252112e-05, "loss": 0.1913, "step": 8590 }, { "epoch": 1.1176088369070825, "grad_norm": 0.25223296880722046, "learning_rate": 3.882391163092918e-05, "loss": 0.2362, "step": 8600 }, { "epoch": 1.1189083820662769, "grad_norm": 0.3057113289833069, "learning_rate": 3.881091617933723e-05, "loss": 0.1536, "step": 8610 }, { "epoch": 1.120207927225471, "grad_norm": 0.23131121695041656, "learning_rate": 3.879792072774529e-05, "loss": 0.1669, "step": 8620 }, { "epoch": 1.1215074723846654, "grad_norm": 1.2183253765106201, "learning_rate": 3.878492527615335e-05, "loss": 0.2513, "step": 8630 }, { "epoch": 1.1228070175438596, "grad_norm": 0.16605877876281738, "learning_rate": 3.877192982456141e-05, "loss": 0.1804, "step": 8640 }, { "epoch": 1.124106562703054, "grad_norm": 0.8810831904411316, "learning_rate": 3.875893437296946e-05, "loss": 0.2589, "step": 8650 }, { "epoch": 1.1254061078622482, "grad_norm": 2.9173800945281982, "learning_rate": 3.874593892137752e-05, "loss": 0.1471, "step": 8660 }, { "epoch": 1.1267056530214425, "grad_norm": 0.5857443809509277, "learning_rate": 3.8732943469785575e-05, "loss": 0.1528, "step": 8670 }, { "epoch": 1.1280051981806367, "grad_norm": 0.2771231234073639, "learning_rate": 3.871994801819364e-05, "loss": 0.1356, "step": 8680 }, { "epoch": 1.129304743339831, "grad_norm": 0.5000995993614197, "learning_rate": 3.8706952566601686e-05, "loss": 0.1829, "step": 8690 }, { "epoch": 1.1306042884990253, "grad_norm": 0.4226920008659363, "learning_rate": 3.869395711500975e-05, "loss": 0.1794, "step": 8700 }, { "epoch": 1.1319038336582197, "grad_norm": 0.521743893623352, "learning_rate": 3.8680961663417805e-05, "loss": 0.1465, "step": 8710 }, { "epoch": 1.1332033788174138, "grad_norm": 0.35894185304641724, "learning_rate": 3.866796621182586e-05, "loss": 0.1384, "step": 8720 }, { "epoch": 1.1345029239766082, "grad_norm": 0.38715144991874695, "learning_rate": 3.865497076023392e-05, "loss": 0.1556, "step": 8730 }, { "epoch": 1.1358024691358024, "grad_norm": 0.47103163599967957, "learning_rate": 3.864197530864197e-05, "loss": 0.2157, "step": 8740 }, { "epoch": 1.1371020142949968, "grad_norm": 1.0344867706298828, "learning_rate": 3.8628979857050035e-05, "loss": 0.2052, "step": 8750 }, { "epoch": 1.138401559454191, "grad_norm": 0.25414779782295227, "learning_rate": 3.861598440545809e-05, "loss": 0.1076, "step": 8760 }, { "epoch": 1.1397011046133854, "grad_norm": 0.2816448211669922, "learning_rate": 3.860298895386615e-05, "loss": 0.1873, "step": 8770 }, { "epoch": 1.1410006497725795, "grad_norm": 0.3432437777519226, "learning_rate": 3.858999350227421e-05, "loss": 0.1718, "step": 8780 }, { "epoch": 1.142300194931774, "grad_norm": 0.2586353123188019, "learning_rate": 3.857699805068226e-05, "loss": 0.1433, "step": 8790 }, { "epoch": 1.143599740090968, "grad_norm": 0.32992932200431824, "learning_rate": 3.856400259909032e-05, "loss": 0.2565, "step": 8800 }, { "epoch": 1.1448992852501625, "grad_norm": 0.21069279313087463, "learning_rate": 3.855100714749838e-05, "loss": 0.1838, "step": 8810 }, { "epoch": 1.1461988304093567, "grad_norm": 0.3348062336444855, "learning_rate": 3.853801169590643e-05, "loss": 0.1841, "step": 8820 }, { "epoch": 1.147498375568551, "grad_norm": 0.7765102982521057, "learning_rate": 3.8525016244314495e-05, "loss": 0.1874, "step": 8830 }, { "epoch": 1.1487979207277452, "grad_norm": 0.15246964991092682, "learning_rate": 3.8512020792722544e-05, "loss": 0.2233, "step": 8840 }, { "epoch": 1.1500974658869396, "grad_norm": 0.583118200302124, "learning_rate": 3.849902534113061e-05, "loss": 0.1544, "step": 8850 }, { "epoch": 1.1513970110461338, "grad_norm": 0.49248841404914856, "learning_rate": 3.848602988953866e-05, "loss": 0.1985, "step": 8860 }, { "epoch": 1.1526965562053282, "grad_norm": 0.7757649421691895, "learning_rate": 3.847303443794672e-05, "loss": 0.2626, "step": 8870 }, { "epoch": 1.1539961013645224, "grad_norm": 0.3953228294849396, "learning_rate": 3.846003898635478e-05, "loss": 0.225, "step": 8880 }, { "epoch": 1.1552956465237167, "grad_norm": 0.23284228146076202, "learning_rate": 3.844704353476283e-05, "loss": 0.2635, "step": 8890 }, { "epoch": 1.156595191682911, "grad_norm": 0.5884006023406982, "learning_rate": 3.843404808317089e-05, "loss": 0.2209, "step": 8900 }, { "epoch": 1.1578947368421053, "grad_norm": 0.5917836427688599, "learning_rate": 3.842105263157895e-05, "loss": 0.165, "step": 8910 }, { "epoch": 1.1591942820012995, "grad_norm": 0.5666766166687012, "learning_rate": 3.8408057179987005e-05, "loss": 0.4229, "step": 8920 }, { "epoch": 1.1604938271604939, "grad_norm": 0.32064709067344666, "learning_rate": 3.839506172839507e-05, "loss": 0.2156, "step": 8930 }, { "epoch": 1.161793372319688, "grad_norm": 0.5510064363479614, "learning_rate": 3.8382066276803117e-05, "loss": 0.1284, "step": 8940 }, { "epoch": 1.1630929174788824, "grad_norm": 0.8954606056213379, "learning_rate": 3.836907082521118e-05, "loss": 0.2376, "step": 8950 }, { "epoch": 1.1643924626380766, "grad_norm": 0.4816662073135376, "learning_rate": 3.8356075373619235e-05, "loss": 0.2102, "step": 8960 }, { "epoch": 1.165692007797271, "grad_norm": 0.34296101331710815, "learning_rate": 3.834307992202729e-05, "loss": 0.1519, "step": 8970 }, { "epoch": 1.1669915529564652, "grad_norm": 1.168250560760498, "learning_rate": 3.8330084470435354e-05, "loss": 0.2125, "step": 8980 }, { "epoch": 1.1682910981156596, "grad_norm": 0.6790671944618225, "learning_rate": 3.83170890188434e-05, "loss": 0.177, "step": 8990 }, { "epoch": 1.1695906432748537, "grad_norm": 0.39817142486572266, "learning_rate": 3.8304093567251465e-05, "loss": 0.1526, "step": 9000 }, { "epoch": 1.1708901884340481, "grad_norm": 0.5401949286460876, "learning_rate": 3.829109811565952e-05, "loss": 0.1396, "step": 9010 }, { "epoch": 1.1721897335932423, "grad_norm": 0.31111380457878113, "learning_rate": 3.827810266406758e-05, "loss": 0.2028, "step": 9020 }, { "epoch": 1.1734892787524367, "grad_norm": 0.39672335982322693, "learning_rate": 3.826510721247564e-05, "loss": 0.1374, "step": 9030 }, { "epoch": 1.1747888239116309, "grad_norm": 0.4857043921947479, "learning_rate": 3.825211176088369e-05, "loss": 0.1578, "step": 9040 }, { "epoch": 1.1760883690708253, "grad_norm": 0.3504287600517273, "learning_rate": 3.823911630929175e-05, "loss": 0.2036, "step": 9050 }, { "epoch": 1.1773879142300194, "grad_norm": 0.34342873096466064, "learning_rate": 3.822612085769981e-05, "loss": 0.1722, "step": 9060 }, { "epoch": 1.1786874593892138, "grad_norm": 0.5448178052902222, "learning_rate": 3.821312540610786e-05, "loss": 0.1433, "step": 9070 }, { "epoch": 1.179987004548408, "grad_norm": 0.30345460772514343, "learning_rate": 3.8200129954515926e-05, "loss": 0.1507, "step": 9080 }, { "epoch": 1.1812865497076024, "grad_norm": 0.1706763207912445, "learning_rate": 3.8187134502923975e-05, "loss": 0.1532, "step": 9090 }, { "epoch": 1.1825860948667966, "grad_norm": 0.09062011539936066, "learning_rate": 3.817413905133204e-05, "loss": 0.2266, "step": 9100 }, { "epoch": 1.183885640025991, "grad_norm": 0.3944757580757141, "learning_rate": 3.816114359974009e-05, "loss": 0.1994, "step": 9110 }, { "epoch": 1.1851851851851851, "grad_norm": 0.20048794150352478, "learning_rate": 3.814814814814815e-05, "loss": 0.1597, "step": 9120 }, { "epoch": 1.1864847303443795, "grad_norm": 0.39486053586006165, "learning_rate": 3.813515269655621e-05, "loss": 0.1707, "step": 9130 }, { "epoch": 1.1877842755035737, "grad_norm": 0.4164906442165375, "learning_rate": 3.812215724496426e-05, "loss": 0.1141, "step": 9140 }, { "epoch": 1.189083820662768, "grad_norm": 0.22045524418354034, "learning_rate": 3.8109161793372323e-05, "loss": 0.1476, "step": 9150 }, { "epoch": 1.1903833658219622, "grad_norm": 0.5243862867355347, "learning_rate": 3.809616634178038e-05, "loss": 0.1426, "step": 9160 }, { "epoch": 1.1916829109811566, "grad_norm": 0.7770912051200867, "learning_rate": 3.8083170890188435e-05, "loss": 0.2092, "step": 9170 }, { "epoch": 1.1929824561403508, "grad_norm": 0.6492583155632019, "learning_rate": 3.80701754385965e-05, "loss": 0.1811, "step": 9180 }, { "epoch": 1.1942820012995452, "grad_norm": 0.8172019124031067, "learning_rate": 3.805717998700455e-05, "loss": 0.1792, "step": 9190 }, { "epoch": 1.1955815464587394, "grad_norm": 0.7283622026443481, "learning_rate": 3.804418453541261e-05, "loss": 0.2531, "step": 9200 }, { "epoch": 1.1968810916179338, "grad_norm": 0.42448538541793823, "learning_rate": 3.803118908382066e-05, "loss": 0.2092, "step": 9210 }, { "epoch": 1.198180636777128, "grad_norm": 0.5133404731750488, "learning_rate": 3.801819363222872e-05, "loss": 0.1716, "step": 9220 }, { "epoch": 1.1994801819363223, "grad_norm": 0.704412043094635, "learning_rate": 3.8005198180636784e-05, "loss": 0.2281, "step": 9230 }, { "epoch": 1.2007797270955165, "grad_norm": 0.5823032855987549, "learning_rate": 3.799220272904483e-05, "loss": 0.1081, "step": 9240 }, { "epoch": 1.202079272254711, "grad_norm": 0.615127444267273, "learning_rate": 3.7979207277452896e-05, "loss": 0.2741, "step": 9250 }, { "epoch": 1.203378817413905, "grad_norm": 0.3175019323825836, "learning_rate": 3.7966211825860945e-05, "loss": 0.1569, "step": 9260 }, { "epoch": 1.2046783625730995, "grad_norm": 0.5858662128448486, "learning_rate": 3.795321637426901e-05, "loss": 0.2304, "step": 9270 }, { "epoch": 1.2059779077322936, "grad_norm": 0.5989826917648315, "learning_rate": 3.794022092267707e-05, "loss": 0.1967, "step": 9280 }, { "epoch": 1.207277452891488, "grad_norm": 0.5095000863075256, "learning_rate": 3.792722547108512e-05, "loss": 0.1764, "step": 9290 }, { "epoch": 1.2085769980506822, "grad_norm": 0.15232309699058533, "learning_rate": 3.791423001949318e-05, "loss": 0.1495, "step": 9300 }, { "epoch": 1.2098765432098766, "grad_norm": 0.4068877696990967, "learning_rate": 3.790123456790123e-05, "loss": 0.1691, "step": 9310 }, { "epoch": 1.2111760883690708, "grad_norm": 0.3750016391277313, "learning_rate": 3.788823911630929e-05, "loss": 0.1125, "step": 9320 }, { "epoch": 1.2124756335282652, "grad_norm": 0.4318859279155731, "learning_rate": 3.787524366471735e-05, "loss": 0.2271, "step": 9330 }, { "epoch": 1.2137751786874593, "grad_norm": 0.311366468667984, "learning_rate": 3.7862248213125405e-05, "loss": 0.1543, "step": 9340 }, { "epoch": 1.2150747238466537, "grad_norm": 0.8184641599655151, "learning_rate": 3.784925276153347e-05, "loss": 0.2028, "step": 9350 }, { "epoch": 1.2163742690058479, "grad_norm": 0.38286092877388, "learning_rate": 3.783625730994152e-05, "loss": 0.1796, "step": 9360 }, { "epoch": 1.2176738141650423, "grad_norm": 0.257458359003067, "learning_rate": 3.782326185834958e-05, "loss": 0.1851, "step": 9370 }, { "epoch": 1.2189733593242364, "grad_norm": 0.22059135138988495, "learning_rate": 3.7810266406757635e-05, "loss": 0.197, "step": 9380 }, { "epoch": 1.2202729044834308, "grad_norm": 0.37548762559890747, "learning_rate": 3.779727095516569e-05, "loss": 0.1732, "step": 9390 }, { "epoch": 1.221572449642625, "grad_norm": 0.8877992033958435, "learning_rate": 3.7784275503573754e-05, "loss": 0.2235, "step": 9400 }, { "epoch": 1.2228719948018194, "grad_norm": 0.15519185364246368, "learning_rate": 3.77712800519818e-05, "loss": 0.1662, "step": 9410 }, { "epoch": 1.2241715399610136, "grad_norm": 0.22009357810020447, "learning_rate": 3.7758284600389865e-05, "loss": 0.1247, "step": 9420 }, { "epoch": 1.225471085120208, "grad_norm": 0.466604620218277, "learning_rate": 3.774528914879792e-05, "loss": 0.1821, "step": 9430 }, { "epoch": 1.2267706302794021, "grad_norm": 0.509846031665802, "learning_rate": 3.773229369720598e-05, "loss": 0.1869, "step": 9440 }, { "epoch": 1.2280701754385965, "grad_norm": 0.32255542278289795, "learning_rate": 3.771929824561404e-05, "loss": 0.1711, "step": 9450 }, { "epoch": 1.2293697205977907, "grad_norm": 0.21253439784049988, "learning_rate": 3.770630279402209e-05, "loss": 0.1764, "step": 9460 }, { "epoch": 1.230669265756985, "grad_norm": 1.357151746749878, "learning_rate": 3.769330734243015e-05, "loss": 0.244, "step": 9470 }, { "epoch": 1.2319688109161793, "grad_norm": 0.2772939205169678, "learning_rate": 3.768031189083821e-05, "loss": 0.1784, "step": 9480 }, { "epoch": 1.2332683560753737, "grad_norm": 0.7557883262634277, "learning_rate": 3.766731643924626e-05, "loss": 0.2209, "step": 9490 }, { "epoch": 1.2345679012345678, "grad_norm": 0.49565836787223816, "learning_rate": 3.7654320987654326e-05, "loss": 0.2105, "step": 9500 }, { "epoch": 1.2358674463937622, "grad_norm": 0.40260589122772217, "learning_rate": 3.7641325536062375e-05, "loss": 0.1702, "step": 9510 }, { "epoch": 1.2371669915529564, "grad_norm": 0.2834109961986542, "learning_rate": 3.762833008447044e-05, "loss": 0.2435, "step": 9520 }, { "epoch": 1.2384665367121508, "grad_norm": 0.47542792558670044, "learning_rate": 3.761533463287849e-05, "loss": 0.1647, "step": 9530 }, { "epoch": 1.239766081871345, "grad_norm": 0.33072176575660706, "learning_rate": 3.760233918128655e-05, "loss": 0.17, "step": 9540 }, { "epoch": 1.2410656270305394, "grad_norm": 0.332745760679245, "learning_rate": 3.758934372969461e-05, "loss": 0.1511, "step": 9550 }, { "epoch": 1.2423651721897335, "grad_norm": 0.3059347867965698, "learning_rate": 3.757634827810266e-05, "loss": 0.1813, "step": 9560 }, { "epoch": 1.243664717348928, "grad_norm": 1.111210823059082, "learning_rate": 3.7563352826510724e-05, "loss": 0.1673, "step": 9570 }, { "epoch": 1.244964262508122, "grad_norm": 0.1372235119342804, "learning_rate": 3.755035737491878e-05, "loss": 0.1849, "step": 9580 }, { "epoch": 1.2462638076673165, "grad_norm": 0.5306878685951233, "learning_rate": 3.7537361923326835e-05, "loss": 0.1467, "step": 9590 }, { "epoch": 1.2475633528265107, "grad_norm": 0.5232819318771362, "learning_rate": 3.75243664717349e-05, "loss": 0.1922, "step": 9600 }, { "epoch": 1.248862897985705, "grad_norm": 0.941021740436554, "learning_rate": 3.751137102014295e-05, "loss": 0.2999, "step": 9610 }, { "epoch": 1.2501624431448992, "grad_norm": 0.4733368158340454, "learning_rate": 3.749837556855101e-05, "loss": 0.1371, "step": 9620 }, { "epoch": 1.2514619883040936, "grad_norm": 0.4709605872631073, "learning_rate": 3.7485380116959065e-05, "loss": 0.2155, "step": 9630 }, { "epoch": 1.2527615334632878, "grad_norm": 0.24718764424324036, "learning_rate": 3.747238466536712e-05, "loss": 0.1478, "step": 9640 }, { "epoch": 1.2540610786224822, "grad_norm": 0.24938267469406128, "learning_rate": 3.7459389213775184e-05, "loss": 0.1469, "step": 9650 }, { "epoch": 1.2553606237816763, "grad_norm": 0.20109039545059204, "learning_rate": 3.744639376218323e-05, "loss": 0.1619, "step": 9660 }, { "epoch": 1.2566601689408707, "grad_norm": 0.4879132807254791, "learning_rate": 3.7433398310591296e-05, "loss": 0.1554, "step": 9670 }, { "epoch": 1.257959714100065, "grad_norm": 0.814301073551178, "learning_rate": 3.742040285899935e-05, "loss": 0.1602, "step": 9680 }, { "epoch": 1.2592592592592593, "grad_norm": 1.2616009712219238, "learning_rate": 3.740740740740741e-05, "loss": 0.1803, "step": 9690 }, { "epoch": 1.2605588044184535, "grad_norm": 0.6122356653213501, "learning_rate": 3.739441195581547e-05, "loss": 0.1511, "step": 9700 }, { "epoch": 1.2618583495776479, "grad_norm": 0.6841140985488892, "learning_rate": 3.738141650422352e-05, "loss": 0.2053, "step": 9710 }, { "epoch": 1.263157894736842, "grad_norm": 0.6597102284431458, "learning_rate": 3.736842105263158e-05, "loss": 0.2058, "step": 9720 }, { "epoch": 1.2644574398960364, "grad_norm": 0.7387487292289734, "learning_rate": 3.735542560103964e-05, "loss": 0.2164, "step": 9730 }, { "epoch": 1.2657569850552306, "grad_norm": 0.2584919035434723, "learning_rate": 3.7342430149447693e-05, "loss": 0.0953, "step": 9740 }, { "epoch": 1.267056530214425, "grad_norm": 0.390354722738266, "learning_rate": 3.7329434697855756e-05, "loss": 0.2646, "step": 9750 }, { "epoch": 1.2683560753736192, "grad_norm": 0.30132752656936646, "learning_rate": 3.7316439246263805e-05, "loss": 0.1612, "step": 9760 }, { "epoch": 1.2696556205328136, "grad_norm": 0.2203841358423233, "learning_rate": 3.730344379467187e-05, "loss": 0.1226, "step": 9770 }, { "epoch": 1.2709551656920077, "grad_norm": 0.3471860885620117, "learning_rate": 3.7290448343079924e-05, "loss": 0.2699, "step": 9780 }, { "epoch": 1.2722547108512021, "grad_norm": 0.6315646767616272, "learning_rate": 3.727745289148798e-05, "loss": 0.1638, "step": 9790 }, { "epoch": 1.2735542560103963, "grad_norm": 0.8011904358863831, "learning_rate": 3.726445743989604e-05, "loss": 0.2557, "step": 9800 }, { "epoch": 1.2748538011695907, "grad_norm": 0.22181391716003418, "learning_rate": 3.725146198830409e-05, "loss": 0.1887, "step": 9810 }, { "epoch": 1.2761533463287849, "grad_norm": 0.5147697329521179, "learning_rate": 3.7238466536712154e-05, "loss": 0.2576, "step": 9820 }, { "epoch": 1.2774528914879792, "grad_norm": 0.5389986038208008, "learning_rate": 3.722547108512021e-05, "loss": 0.172, "step": 9830 }, { "epoch": 1.2787524366471734, "grad_norm": 0.5199617147445679, "learning_rate": 3.7212475633528266e-05, "loss": 0.1677, "step": 9840 }, { "epoch": 1.2800519818063678, "grad_norm": 0.2729426920413971, "learning_rate": 3.719948018193632e-05, "loss": 0.2739, "step": 9850 }, { "epoch": 1.281351526965562, "grad_norm": 0.49428701400756836, "learning_rate": 3.718648473034438e-05, "loss": 0.1146, "step": 9860 }, { "epoch": 1.2826510721247564, "grad_norm": 0.43753939867019653, "learning_rate": 3.717348927875244e-05, "loss": 0.2239, "step": 9870 }, { "epoch": 1.2839506172839505, "grad_norm": 0.43069466948509216, "learning_rate": 3.7160493827160496e-05, "loss": 0.1496, "step": 9880 }, { "epoch": 1.285250162443145, "grad_norm": 0.6861243844032288, "learning_rate": 3.714749837556855e-05, "loss": 0.2146, "step": 9890 }, { "epoch": 1.286549707602339, "grad_norm": 0.3587533235549927, "learning_rate": 3.713450292397661e-05, "loss": 0.2001, "step": 9900 }, { "epoch": 1.2878492527615335, "grad_norm": 0.3496533930301666, "learning_rate": 3.712150747238466e-05, "loss": 0.2374, "step": 9910 }, { "epoch": 1.2891487979207277, "grad_norm": 0.640613853931427, "learning_rate": 3.7108512020792726e-05, "loss": 0.186, "step": 9920 }, { "epoch": 1.290448343079922, "grad_norm": 0.3684648871421814, "learning_rate": 3.709551656920078e-05, "loss": 0.1947, "step": 9930 }, { "epoch": 1.2917478882391162, "grad_norm": 0.4065145254135132, "learning_rate": 3.708252111760884e-05, "loss": 0.155, "step": 9940 }, { "epoch": 1.2930474333983106, "grad_norm": 0.6955223679542542, "learning_rate": 3.7069525666016894e-05, "loss": 0.1791, "step": 9950 }, { "epoch": 1.2943469785575048, "grad_norm": 0.3872520923614502, "learning_rate": 3.705653021442495e-05, "loss": 0.2043, "step": 9960 }, { "epoch": 1.2956465237166992, "grad_norm": 0.44727110862731934, "learning_rate": 3.704353476283301e-05, "loss": 0.1975, "step": 9970 }, { "epoch": 1.2969460688758934, "grad_norm": 0.39310571551322937, "learning_rate": 3.703053931124107e-05, "loss": 0.1743, "step": 9980 }, { "epoch": 1.2982456140350878, "grad_norm": 0.37417587637901306, "learning_rate": 3.7017543859649124e-05, "loss": 0.1902, "step": 9990 }, { "epoch": 1.299545159194282, "grad_norm": 0.2705618739128113, "learning_rate": 3.700454840805718e-05, "loss": 0.1609, "step": 10000 }, { "epoch": 1.3008447043534763, "grad_norm": 0.4996832609176636, "learning_rate": 3.6991552956465235e-05, "loss": 0.1805, "step": 10010 }, { "epoch": 1.3021442495126705, "grad_norm": 0.24369771778583527, "learning_rate": 3.69785575048733e-05, "loss": 0.1143, "step": 10020 }, { "epoch": 1.3034437946718649, "grad_norm": 0.8451298475265503, "learning_rate": 3.6965562053281354e-05, "loss": 0.2088, "step": 10030 }, { "epoch": 1.304743339831059, "grad_norm": 0.3018137812614441, "learning_rate": 3.695256660168941e-05, "loss": 0.1913, "step": 10040 }, { "epoch": 1.3060428849902534, "grad_norm": 0.13434632122516632, "learning_rate": 3.6939571150097466e-05, "loss": 0.188, "step": 10050 }, { "epoch": 1.3073424301494476, "grad_norm": 0.4705439507961273, "learning_rate": 3.692657569850552e-05, "loss": 0.1367, "step": 10060 }, { "epoch": 1.308641975308642, "grad_norm": 0.33203256130218506, "learning_rate": 3.6913580246913584e-05, "loss": 0.2542, "step": 10070 }, { "epoch": 1.3099415204678362, "grad_norm": 0.14473789930343628, "learning_rate": 3.690058479532164e-05, "loss": 0.224, "step": 10080 }, { "epoch": 1.3112410656270306, "grad_norm": 0.16044795513153076, "learning_rate": 3.6887589343729696e-05, "loss": 0.1546, "step": 10090 }, { "epoch": 1.3125406107862247, "grad_norm": 0.3405941426753998, "learning_rate": 3.687459389213775e-05, "loss": 0.2225, "step": 10100 }, { "epoch": 1.3138401559454191, "grad_norm": 0.2912195026874542, "learning_rate": 3.686159844054581e-05, "loss": 0.1376, "step": 10110 }, { "epoch": 1.3151397011046133, "grad_norm": 0.5326084494590759, "learning_rate": 3.684860298895387e-05, "loss": 0.1409, "step": 10120 }, { "epoch": 1.3164392462638077, "grad_norm": 0.38725873827934265, "learning_rate": 3.6835607537361926e-05, "loss": 0.1784, "step": 10130 }, { "epoch": 1.3177387914230019, "grad_norm": 0.6634486317634583, "learning_rate": 3.682261208576998e-05, "loss": 0.1759, "step": 10140 }, { "epoch": 1.3190383365821963, "grad_norm": 0.15838885307312012, "learning_rate": 3.680961663417804e-05, "loss": 0.164, "step": 10150 }, { "epoch": 1.3203378817413904, "grad_norm": 0.19534489512443542, "learning_rate": 3.6796621182586094e-05, "loss": 0.1826, "step": 10160 }, { "epoch": 1.3216374269005848, "grad_norm": 0.25820982456207275, "learning_rate": 3.6783625730994156e-05, "loss": 0.1676, "step": 10170 }, { "epoch": 1.322936972059779, "grad_norm": 0.28103718161582947, "learning_rate": 3.677063027940221e-05, "loss": 0.1555, "step": 10180 }, { "epoch": 1.3242365172189734, "grad_norm": 0.41762980818748474, "learning_rate": 3.675763482781027e-05, "loss": 0.1958, "step": 10190 }, { "epoch": 1.3255360623781676, "grad_norm": 0.25611570477485657, "learning_rate": 3.6744639376218324e-05, "loss": 0.1592, "step": 10200 }, { "epoch": 1.326835607537362, "grad_norm": 0.659759521484375, "learning_rate": 3.673164392462638e-05, "loss": 0.2132, "step": 10210 }, { "epoch": 1.3281351526965561, "grad_norm": 0.2432621866464615, "learning_rate": 3.671864847303444e-05, "loss": 0.1726, "step": 10220 }, { "epoch": 1.3294346978557505, "grad_norm": 0.4458739161491394, "learning_rate": 3.67056530214425e-05, "loss": 0.1457, "step": 10230 }, { "epoch": 1.3307342430149447, "grad_norm": 1.0003643035888672, "learning_rate": 3.6692657569850554e-05, "loss": 0.1855, "step": 10240 }, { "epoch": 1.332033788174139, "grad_norm": 0.22885870933532715, "learning_rate": 3.667966211825861e-05, "loss": 0.1943, "step": 10250 }, { "epoch": 1.3333333333333333, "grad_norm": 0.6624867916107178, "learning_rate": 3.6666666666666666e-05, "loss": 0.1777, "step": 10260 }, { "epoch": 1.3346328784925277, "grad_norm": 1.1174222230911255, "learning_rate": 3.665367121507473e-05, "loss": 0.1812, "step": 10270 }, { "epoch": 1.3359324236517218, "grad_norm": 0.43146634101867676, "learning_rate": 3.6640675763482784e-05, "loss": 0.1607, "step": 10280 }, { "epoch": 1.3372319688109162, "grad_norm": 0.22149251401424408, "learning_rate": 3.662768031189084e-05, "loss": 0.1284, "step": 10290 }, { "epoch": 1.3385315139701104, "grad_norm": 0.43104633688926697, "learning_rate": 3.6614684860298896e-05, "loss": 0.2227, "step": 10300 }, { "epoch": 1.3398310591293048, "grad_norm": 0.3095886707305908, "learning_rate": 3.660168940870695e-05, "loss": 0.1956, "step": 10310 }, { "epoch": 1.341130604288499, "grad_norm": 0.9024443030357361, "learning_rate": 3.6588693957115014e-05, "loss": 0.178, "step": 10320 }, { "epoch": 1.3424301494476933, "grad_norm": 0.5690615177154541, "learning_rate": 3.657569850552307e-05, "loss": 0.1976, "step": 10330 }, { "epoch": 1.3437296946068875, "grad_norm": 0.36031824350357056, "learning_rate": 3.6562703053931126e-05, "loss": 0.1217, "step": 10340 }, { "epoch": 1.345029239766082, "grad_norm": 0.32017818093299866, "learning_rate": 3.654970760233918e-05, "loss": 0.2647, "step": 10350 }, { "epoch": 1.346328784925276, "grad_norm": 0.9640544652938843, "learning_rate": 3.653671215074724e-05, "loss": 0.2206, "step": 10360 }, { "epoch": 1.3476283300844705, "grad_norm": 0.6059637069702148, "learning_rate": 3.6523716699155294e-05, "loss": 0.2225, "step": 10370 }, { "epoch": 1.3489278752436646, "grad_norm": 0.5063768029212952, "learning_rate": 3.6510721247563356e-05, "loss": 0.189, "step": 10380 }, { "epoch": 1.350227420402859, "grad_norm": 0.37174665927886963, "learning_rate": 3.649772579597141e-05, "loss": 0.1897, "step": 10390 }, { "epoch": 1.3515269655620532, "grad_norm": 0.41024452447891235, "learning_rate": 3.648473034437947e-05, "loss": 0.1861, "step": 10400 }, { "epoch": 1.3528265107212476, "grad_norm": 0.3305908739566803, "learning_rate": 3.6471734892787524e-05, "loss": 0.2579, "step": 10410 }, { "epoch": 1.3541260558804418, "grad_norm": 0.6731358170509338, "learning_rate": 3.645873944119558e-05, "loss": 0.1851, "step": 10420 }, { "epoch": 1.3554256010396362, "grad_norm": 0.2706781029701233, "learning_rate": 3.644574398960364e-05, "loss": 0.164, "step": 10430 }, { "epoch": 1.3567251461988303, "grad_norm": 0.38808348774909973, "learning_rate": 3.64327485380117e-05, "loss": 0.14, "step": 10440 }, { "epoch": 1.3580246913580247, "grad_norm": 0.4063732922077179, "learning_rate": 3.6419753086419754e-05, "loss": 0.1651, "step": 10450 }, { "epoch": 1.359324236517219, "grad_norm": 0.2759503722190857, "learning_rate": 3.640675763482781e-05, "loss": 0.2104, "step": 10460 }, { "epoch": 1.3606237816764133, "grad_norm": 0.557923436164856, "learning_rate": 3.6393762183235866e-05, "loss": 0.1947, "step": 10470 }, { "epoch": 1.3619233268356075, "grad_norm": 0.2511429488658905, "learning_rate": 3.638076673164393e-05, "loss": 0.2827, "step": 10480 }, { "epoch": 1.3632228719948019, "grad_norm": 0.45019203424453735, "learning_rate": 3.6367771280051984e-05, "loss": 0.1797, "step": 10490 }, { "epoch": 1.364522417153996, "grad_norm": 0.4718112647533417, "learning_rate": 3.635477582846004e-05, "loss": 0.1969, "step": 10500 }, { "epoch": 1.3658219623131904, "grad_norm": 0.23971958458423615, "learning_rate": 3.6341780376868096e-05, "loss": 0.1675, "step": 10510 }, { "epoch": 1.3671215074723846, "grad_norm": 0.5515020489692688, "learning_rate": 3.632878492527615e-05, "loss": 0.1827, "step": 10520 }, { "epoch": 1.368421052631579, "grad_norm": 0.3322247266769409, "learning_rate": 3.6315789473684214e-05, "loss": 0.1397, "step": 10530 }, { "epoch": 1.3697205977907732, "grad_norm": 0.48299649357795715, "learning_rate": 3.630279402209227e-05, "loss": 0.1766, "step": 10540 }, { "epoch": 1.3710201429499675, "grad_norm": 0.7215142250061035, "learning_rate": 3.6289798570500326e-05, "loss": 0.198, "step": 10550 }, { "epoch": 1.3723196881091617, "grad_norm": 0.8658707141876221, "learning_rate": 3.627680311890838e-05, "loss": 0.1639, "step": 10560 }, { "epoch": 1.373619233268356, "grad_norm": 0.3001706600189209, "learning_rate": 3.626380766731644e-05, "loss": 0.2249, "step": 10570 }, { "epoch": 1.3749187784275503, "grad_norm": 0.4459267854690552, "learning_rate": 3.62508122157245e-05, "loss": 0.168, "step": 10580 }, { "epoch": 1.3762183235867447, "grad_norm": 0.5016582012176514, "learning_rate": 3.6237816764132556e-05, "loss": 0.227, "step": 10590 }, { "epoch": 1.3775178687459388, "grad_norm": 0.2260386049747467, "learning_rate": 3.622482131254061e-05, "loss": 0.2669, "step": 10600 }, { "epoch": 1.3788174139051332, "grad_norm": 0.19836916029453278, "learning_rate": 3.621182586094867e-05, "loss": 0.1034, "step": 10610 }, { "epoch": 1.3801169590643274, "grad_norm": 0.20563043653964996, "learning_rate": 3.6198830409356724e-05, "loss": 0.2305, "step": 10620 }, { "epoch": 1.3814165042235218, "grad_norm": 0.5119601488113403, "learning_rate": 3.6185834957764787e-05, "loss": 0.1758, "step": 10630 }, { "epoch": 1.382716049382716, "grad_norm": 0.3191220164299011, "learning_rate": 3.617283950617284e-05, "loss": 0.1408, "step": 10640 }, { "epoch": 1.3840155945419104, "grad_norm": 0.736720860004425, "learning_rate": 3.61598440545809e-05, "loss": 0.22, "step": 10650 }, { "epoch": 1.3853151397011045, "grad_norm": 0.5634852647781372, "learning_rate": 3.6146848602988954e-05, "loss": 0.186, "step": 10660 }, { "epoch": 1.386614684860299, "grad_norm": 0.3244497776031494, "learning_rate": 3.613385315139701e-05, "loss": 0.2242, "step": 10670 }, { "epoch": 1.387914230019493, "grad_norm": 0.21400271356105804, "learning_rate": 3.612085769980507e-05, "loss": 0.0928, "step": 10680 }, { "epoch": 1.3892137751786875, "grad_norm": 0.2823992371559143, "learning_rate": 3.610786224821313e-05, "loss": 0.178, "step": 10690 }, { "epoch": 1.3905133203378817, "grad_norm": 0.5287518501281738, "learning_rate": 3.6094866796621184e-05, "loss": 0.1196, "step": 10700 }, { "epoch": 1.391812865497076, "grad_norm": 0.5703768730163574, "learning_rate": 3.608187134502924e-05, "loss": 0.1526, "step": 10710 }, { "epoch": 1.3931124106562702, "grad_norm": 0.6994734406471252, "learning_rate": 3.6068875893437296e-05, "loss": 0.1735, "step": 10720 }, { "epoch": 1.3944119558154646, "grad_norm": 0.3259182572364807, "learning_rate": 3.605588044184536e-05, "loss": 0.1478, "step": 10730 }, { "epoch": 1.3957115009746588, "grad_norm": 0.432827353477478, "learning_rate": 3.6042884990253415e-05, "loss": 0.2409, "step": 10740 }, { "epoch": 1.3970110461338532, "grad_norm": 0.756672203540802, "learning_rate": 3.602988953866147e-05, "loss": 0.1952, "step": 10750 }, { "epoch": 1.3983105912930474, "grad_norm": 0.3172270953655243, "learning_rate": 3.6016894087069526e-05, "loss": 0.1178, "step": 10760 }, { "epoch": 1.3996101364522417, "grad_norm": 0.4855130910873413, "learning_rate": 3.600389863547758e-05, "loss": 0.2054, "step": 10770 }, { "epoch": 1.400909681611436, "grad_norm": 1.3574531078338623, "learning_rate": 3.5990903183885645e-05, "loss": 0.2196, "step": 10780 }, { "epoch": 1.4022092267706303, "grad_norm": 0.46039021015167236, "learning_rate": 3.59779077322937e-05, "loss": 0.2028, "step": 10790 }, { "epoch": 1.4035087719298245, "grad_norm": 0.43146130442619324, "learning_rate": 3.5964912280701756e-05, "loss": 0.1858, "step": 10800 }, { "epoch": 1.4048083170890189, "grad_norm": 1.081356406211853, "learning_rate": 3.595191682910981e-05, "loss": 0.1696, "step": 10810 }, { "epoch": 1.406107862248213, "grad_norm": 0.19587157666683197, "learning_rate": 3.593892137751787e-05, "loss": 0.1553, "step": 10820 }, { "epoch": 1.4074074074074074, "grad_norm": 0.2932470738887787, "learning_rate": 3.592592592592593e-05, "loss": 0.2326, "step": 10830 }, { "epoch": 1.4087069525666016, "grad_norm": 0.4007440507411957, "learning_rate": 3.591293047433399e-05, "loss": 0.1679, "step": 10840 }, { "epoch": 1.410006497725796, "grad_norm": 0.38796529173851013, "learning_rate": 3.589993502274204e-05, "loss": 0.127, "step": 10850 }, { "epoch": 1.4113060428849902, "grad_norm": 0.4987604320049286, "learning_rate": 3.58869395711501e-05, "loss": 0.1436, "step": 10860 }, { "epoch": 1.4126055880441846, "grad_norm": 0.5756065249443054, "learning_rate": 3.5873944119558154e-05, "loss": 0.1855, "step": 10870 }, { "epoch": 1.4139051332033787, "grad_norm": 0.4227081537246704, "learning_rate": 3.586094866796622e-05, "loss": 0.1334, "step": 10880 }, { "epoch": 1.4152046783625731, "grad_norm": 0.6281440258026123, "learning_rate": 3.5847953216374266e-05, "loss": 0.1801, "step": 10890 }, { "epoch": 1.4165042235217673, "grad_norm": 0.47785207629203796, "learning_rate": 3.583495776478233e-05, "loss": 0.1639, "step": 10900 }, { "epoch": 1.4178037686809617, "grad_norm": 0.40663430094718933, "learning_rate": 3.5821962313190384e-05, "loss": 0.2397, "step": 10910 }, { "epoch": 1.4191033138401559, "grad_norm": 0.738479495048523, "learning_rate": 3.580896686159844e-05, "loss": 0.2398, "step": 10920 }, { "epoch": 1.4204028589993503, "grad_norm": 0.45509448647499084, "learning_rate": 3.57959714100065e-05, "loss": 0.2022, "step": 10930 }, { "epoch": 1.4217024041585444, "grad_norm": 0.3160908818244934, "learning_rate": 3.578297595841455e-05, "loss": 0.1717, "step": 10940 }, { "epoch": 1.4230019493177388, "grad_norm": 0.34352171421051025, "learning_rate": 3.5769980506822615e-05, "loss": 0.1643, "step": 10950 }, { "epoch": 1.424301494476933, "grad_norm": 0.4170229732990265, "learning_rate": 3.575698505523067e-05, "loss": 0.1425, "step": 10960 }, { "epoch": 1.4256010396361274, "grad_norm": 0.3680747151374817, "learning_rate": 3.5743989603638726e-05, "loss": 0.2305, "step": 10970 }, { "epoch": 1.4269005847953216, "grad_norm": 0.49217385053634644, "learning_rate": 3.573099415204679e-05, "loss": 0.1574, "step": 10980 }, { "epoch": 1.428200129954516, "grad_norm": 0.570225179195404, "learning_rate": 3.571799870045484e-05, "loss": 0.1564, "step": 10990 }, { "epoch": 1.4294996751137101, "grad_norm": 0.265756219625473, "learning_rate": 3.57050032488629e-05, "loss": 0.1653, "step": 11000 }, { "epoch": 1.4307992202729045, "grad_norm": 0.22573477029800415, "learning_rate": 3.5692007797270957e-05, "loss": 0.1616, "step": 11010 }, { "epoch": 1.4320987654320987, "grad_norm": 0.20332397520542145, "learning_rate": 3.567901234567901e-05, "loss": 0.1042, "step": 11020 }, { "epoch": 1.433398310591293, "grad_norm": 0.3129253089427948, "learning_rate": 3.5666016894087075e-05, "loss": 0.1741, "step": 11030 }, { "epoch": 1.4346978557504872, "grad_norm": 0.3634129464626312, "learning_rate": 3.5653021442495124e-05, "loss": 0.1586, "step": 11040 }, { "epoch": 1.4359974009096816, "grad_norm": 0.738804817199707, "learning_rate": 3.564002599090319e-05, "loss": 0.2145, "step": 11050 }, { "epoch": 1.4372969460688758, "grad_norm": 0.7082417011260986, "learning_rate": 3.562703053931124e-05, "loss": 0.278, "step": 11060 }, { "epoch": 1.4385964912280702, "grad_norm": 0.38500311970710754, "learning_rate": 3.56140350877193e-05, "loss": 0.1354, "step": 11070 }, { "epoch": 1.4398960363872644, "grad_norm": 0.23377950489521027, "learning_rate": 3.560103963612736e-05, "loss": 0.1518, "step": 11080 }, { "epoch": 1.4411955815464588, "grad_norm": 0.6920309066772461, "learning_rate": 3.558804418453541e-05, "loss": 0.1563, "step": 11090 }, { "epoch": 1.442495126705653, "grad_norm": 0.3090476095676422, "learning_rate": 3.557504873294347e-05, "loss": 0.1093, "step": 11100 }, { "epoch": 1.4437946718648473, "grad_norm": 0.14812569320201874, "learning_rate": 3.556205328135153e-05, "loss": 0.1964, "step": 11110 }, { "epoch": 1.4450942170240415, "grad_norm": 0.3720323443412781, "learning_rate": 3.5549057829759585e-05, "loss": 0.1359, "step": 11120 }, { "epoch": 1.446393762183236, "grad_norm": 0.27285268902778625, "learning_rate": 3.553606237816765e-05, "loss": 0.1672, "step": 11130 }, { "epoch": 1.44769330734243, "grad_norm": 0.3246764838695526, "learning_rate": 3.5523066926575696e-05, "loss": 0.1399, "step": 11140 }, { "epoch": 1.4489928525016245, "grad_norm": 0.4085937440395355, "learning_rate": 3.551007147498376e-05, "loss": 0.153, "step": 11150 }, { "epoch": 1.4502923976608186, "grad_norm": 0.3920857310295105, "learning_rate": 3.5497076023391815e-05, "loss": 0.1667, "step": 11160 }, { "epoch": 1.451591942820013, "grad_norm": 0.2936495840549469, "learning_rate": 3.548408057179987e-05, "loss": 0.1656, "step": 11170 }, { "epoch": 1.4528914879792072, "grad_norm": 0.12969644367694855, "learning_rate": 3.547108512020793e-05, "loss": 0.2307, "step": 11180 }, { "epoch": 1.4541910331384016, "grad_norm": 0.4015539288520813, "learning_rate": 3.545808966861598e-05, "loss": 0.2547, "step": 11190 }, { "epoch": 1.4554905782975958, "grad_norm": 0.8568450212478638, "learning_rate": 3.5445094217024045e-05, "loss": 0.1493, "step": 11200 }, { "epoch": 1.4567901234567902, "grad_norm": 0.12733733654022217, "learning_rate": 3.54320987654321e-05, "loss": 0.1315, "step": 11210 }, { "epoch": 1.4580896686159843, "grad_norm": 0.646072506904602, "learning_rate": 3.541910331384016e-05, "loss": 0.1673, "step": 11220 }, { "epoch": 1.4593892137751787, "grad_norm": 0.4034220278263092, "learning_rate": 3.540610786224822e-05, "loss": 0.1211, "step": 11230 }, { "epoch": 1.4606887589343729, "grad_norm": 0.5844655632972717, "learning_rate": 3.539311241065627e-05, "loss": 0.1754, "step": 11240 }, { "epoch": 1.4619883040935673, "grad_norm": 0.7309711575508118, "learning_rate": 3.538011695906433e-05, "loss": 0.1273, "step": 11250 }, { "epoch": 1.4632878492527615, "grad_norm": 0.46417078375816345, "learning_rate": 3.536712150747239e-05, "loss": 0.1782, "step": 11260 }, { "epoch": 1.4645873944119558, "grad_norm": 0.3854970335960388, "learning_rate": 3.535412605588044e-05, "loss": 0.2106, "step": 11270 }, { "epoch": 1.46588693957115, "grad_norm": 0.23411475121974945, "learning_rate": 3.5341130604288505e-05, "loss": 0.1207, "step": 11280 }, { "epoch": 1.4671864847303444, "grad_norm": 0.9760007262229919, "learning_rate": 3.5328135152696554e-05, "loss": 0.2246, "step": 11290 }, { "epoch": 1.4684860298895386, "grad_norm": 0.17484034597873688, "learning_rate": 3.531513970110462e-05, "loss": 0.1819, "step": 11300 }, { "epoch": 1.469785575048733, "grad_norm": 0.391835480928421, "learning_rate": 3.530214424951267e-05, "loss": 0.1502, "step": 11310 }, { "epoch": 1.4710851202079271, "grad_norm": 0.19608595967292786, "learning_rate": 3.528914879792073e-05, "loss": 0.184, "step": 11320 }, { "epoch": 1.4723846653671215, "grad_norm": 0.7705077528953552, "learning_rate": 3.527615334632879e-05, "loss": 0.2283, "step": 11330 }, { "epoch": 1.4736842105263157, "grad_norm": 0.7703973054885864, "learning_rate": 3.526315789473684e-05, "loss": 0.1831, "step": 11340 }, { "epoch": 1.47498375568551, "grad_norm": 0.43379703164100647, "learning_rate": 3.52501624431449e-05, "loss": 0.198, "step": 11350 }, { "epoch": 1.4762833008447043, "grad_norm": 0.47341614961624146, "learning_rate": 3.523716699155296e-05, "loss": 0.1282, "step": 11360 }, { "epoch": 1.4775828460038987, "grad_norm": 0.5054877400398254, "learning_rate": 3.5224171539961015e-05, "loss": 0.1955, "step": 11370 }, { "epoch": 1.4788823911630928, "grad_norm": 0.40437179803848267, "learning_rate": 3.521117608836908e-05, "loss": 0.1682, "step": 11380 }, { "epoch": 1.4801819363222872, "grad_norm": 0.3742484152317047, "learning_rate": 3.5198180636777126e-05, "loss": 0.1857, "step": 11390 }, { "epoch": 1.4814814814814814, "grad_norm": 0.29333969950675964, "learning_rate": 3.518518518518519e-05, "loss": 0.139, "step": 11400 }, { "epoch": 1.4827810266406758, "grad_norm": 0.5525131225585938, "learning_rate": 3.517218973359324e-05, "loss": 0.1669, "step": 11410 }, { "epoch": 1.48408057179987, "grad_norm": 0.5516939163208008, "learning_rate": 3.51591942820013e-05, "loss": 0.1574, "step": 11420 }, { "epoch": 1.4853801169590644, "grad_norm": 0.6622191667556763, "learning_rate": 3.5146198830409363e-05, "loss": 0.1838, "step": 11430 }, { "epoch": 1.4866796621182585, "grad_norm": 0.3459511399269104, "learning_rate": 3.513320337881741e-05, "loss": 0.1772, "step": 11440 }, { "epoch": 1.487979207277453, "grad_norm": 0.3778039813041687, "learning_rate": 3.5120207927225475e-05, "loss": 0.1078, "step": 11450 }, { "epoch": 1.489278752436647, "grad_norm": 0.38318824768066406, "learning_rate": 3.5107212475633524e-05, "loss": 0.1734, "step": 11460 }, { "epoch": 1.4905782975958415, "grad_norm": 0.7650538682937622, "learning_rate": 3.509421702404159e-05, "loss": 0.1649, "step": 11470 }, { "epoch": 1.4918778427550357, "grad_norm": 0.5227949023246765, "learning_rate": 3.508122157244965e-05, "loss": 0.1449, "step": 11480 }, { "epoch": 1.49317738791423, "grad_norm": 0.6355566382408142, "learning_rate": 3.50682261208577e-05, "loss": 0.1692, "step": 11490 }, { "epoch": 1.4944769330734242, "grad_norm": 0.6525735259056091, "learning_rate": 3.505523066926576e-05, "loss": 0.1849, "step": 11500 }, { "epoch": 1.4957764782326186, "grad_norm": 0.19508244097232819, "learning_rate": 3.504223521767381e-05, "loss": 0.1746, "step": 11510 }, { "epoch": 1.4970760233918128, "grad_norm": 0.800696849822998, "learning_rate": 3.502923976608187e-05, "loss": 0.2144, "step": 11520 }, { "epoch": 1.4983755685510072, "grad_norm": 1.301790714263916, "learning_rate": 3.501624431448993e-05, "loss": 0.1827, "step": 11530 }, { "epoch": 1.4996751137102013, "grad_norm": 0.20151278376579285, "learning_rate": 3.5003248862897985e-05, "loss": 0.1375, "step": 11540 }, { "epoch": 1.5009746588693957, "grad_norm": 0.33789098262786865, "learning_rate": 3.499025341130605e-05, "loss": 0.1898, "step": 11550 }, { "epoch": 1.50227420402859, "grad_norm": 0.23216113448143005, "learning_rate": 3.4977257959714096e-05, "loss": 0.1575, "step": 11560 }, { "epoch": 1.5035737491877843, "grad_norm": 0.3771279454231262, "learning_rate": 3.496426250812216e-05, "loss": 0.1979, "step": 11570 }, { "epoch": 1.5048732943469787, "grad_norm": 0.336507111787796, "learning_rate": 3.4951267056530215e-05, "loss": 0.1942, "step": 11580 }, { "epoch": 1.5061728395061729, "grad_norm": 0.42015349864959717, "learning_rate": 3.493827160493827e-05, "loss": 0.1597, "step": 11590 }, { "epoch": 1.507472384665367, "grad_norm": 0.25481823086738586, "learning_rate": 3.492527615334633e-05, "loss": 0.1294, "step": 11600 }, { "epoch": 1.5087719298245614, "grad_norm": 0.40943917632102966, "learning_rate": 3.491228070175438e-05, "loss": 0.2007, "step": 11610 }, { "epoch": 1.5100714749837558, "grad_norm": 1.0474193096160889, "learning_rate": 3.4899285250162445e-05, "loss": 0.1637, "step": 11620 }, { "epoch": 1.51137102014295, "grad_norm": 0.36315909028053284, "learning_rate": 3.48862897985705e-05, "loss": 0.1953, "step": 11630 }, { "epoch": 1.5126705653021442, "grad_norm": 0.637669563293457, "learning_rate": 3.487329434697856e-05, "loss": 0.1916, "step": 11640 }, { "epoch": 1.5139701104613386, "grad_norm": 0.3473416864871979, "learning_rate": 3.486029889538662e-05, "loss": 0.1863, "step": 11650 }, { "epoch": 1.515269655620533, "grad_norm": 0.5178897976875305, "learning_rate": 3.484730344379467e-05, "loss": 0.2091, "step": 11660 }, { "epoch": 1.5165692007797271, "grad_norm": 0.649094820022583, "learning_rate": 3.483430799220273e-05, "loss": 0.174, "step": 11670 }, { "epoch": 1.5178687459389213, "grad_norm": 0.32507067918777466, "learning_rate": 3.482131254061079e-05, "loss": 0.2415, "step": 11680 }, { "epoch": 1.5191682910981157, "grad_norm": 0.7923129796981812, "learning_rate": 3.480831708901884e-05, "loss": 0.1491, "step": 11690 }, { "epoch": 1.52046783625731, "grad_norm": 0.45553430914878845, "learning_rate": 3.4795321637426905e-05, "loss": 0.1466, "step": 11700 }, { "epoch": 1.5217673814165043, "grad_norm": 0.6135745644569397, "learning_rate": 3.4782326185834955e-05, "loss": 0.1365, "step": 11710 }, { "epoch": 1.5230669265756984, "grad_norm": 0.42142176628112793, "learning_rate": 3.476933073424302e-05, "loss": 0.2946, "step": 11720 }, { "epoch": 1.5243664717348928, "grad_norm": 1.0738581418991089, "learning_rate": 3.475633528265107e-05, "loss": 0.216, "step": 11730 }, { "epoch": 1.5256660168940872, "grad_norm": 0.509818434715271, "learning_rate": 3.474333983105913e-05, "loss": 0.1752, "step": 11740 }, { "epoch": 1.5269655620532814, "grad_norm": 0.6809357404708862, "learning_rate": 3.473034437946719e-05, "loss": 0.167, "step": 11750 }, { "epoch": 1.5282651072124755, "grad_norm": 0.2878524661064148, "learning_rate": 3.471734892787524e-05, "loss": 0.1321, "step": 11760 }, { "epoch": 1.52956465237167, "grad_norm": 0.2866867482662201, "learning_rate": 3.47043534762833e-05, "loss": 0.1984, "step": 11770 }, { "epoch": 1.5308641975308643, "grad_norm": 0.2139711081981659, "learning_rate": 3.469135802469136e-05, "loss": 0.1398, "step": 11780 }, { "epoch": 1.5321637426900585, "grad_norm": 0.43533170223236084, "learning_rate": 3.4678362573099415e-05, "loss": 0.1834, "step": 11790 }, { "epoch": 1.5334632878492527, "grad_norm": 0.6648089289665222, "learning_rate": 3.466536712150748e-05, "loss": 0.163, "step": 11800 }, { "epoch": 1.534762833008447, "grad_norm": 0.3327356278896332, "learning_rate": 3.465237166991553e-05, "loss": 0.1478, "step": 11810 }, { "epoch": 1.5360623781676415, "grad_norm": 0.7659869194030762, "learning_rate": 3.463937621832359e-05, "loss": 0.1354, "step": 11820 }, { "epoch": 1.5373619233268356, "grad_norm": 1.1264728307724, "learning_rate": 3.4626380766731645e-05, "loss": 0.2204, "step": 11830 }, { "epoch": 1.5386614684860298, "grad_norm": 0.72408527135849, "learning_rate": 3.46133853151397e-05, "loss": 0.1872, "step": 11840 }, { "epoch": 1.5399610136452242, "grad_norm": 0.6837506294250488, "learning_rate": 3.4600389863547764e-05, "loss": 0.194, "step": 11850 }, { "epoch": 1.5412605588044186, "grad_norm": 0.2667720913887024, "learning_rate": 3.458739441195581e-05, "loss": 0.1052, "step": 11860 }, { "epoch": 1.5425601039636128, "grad_norm": 0.2810719311237335, "learning_rate": 3.4574398960363875e-05, "loss": 0.2144, "step": 11870 }, { "epoch": 1.543859649122807, "grad_norm": 0.5749772191047668, "learning_rate": 3.456140350877193e-05, "loss": 0.1752, "step": 11880 }, { "epoch": 1.5451591942820013, "grad_norm": 0.7993298172950745, "learning_rate": 3.454840805717999e-05, "loss": 0.1796, "step": 11890 }, { "epoch": 1.5464587394411957, "grad_norm": 0.41337719559669495, "learning_rate": 3.453541260558805e-05, "loss": 0.1806, "step": 11900 }, { "epoch": 1.5477582846003899, "grad_norm": 0.4051779508590698, "learning_rate": 3.45224171539961e-05, "loss": 0.1671, "step": 11910 }, { "epoch": 1.549057829759584, "grad_norm": 0.476482629776001, "learning_rate": 3.450942170240416e-05, "loss": 0.164, "step": 11920 }, { "epoch": 1.5503573749187785, "grad_norm": 0.6457812786102295, "learning_rate": 3.449642625081222e-05, "loss": 0.1615, "step": 11930 }, { "epoch": 1.5516569200779728, "grad_norm": 0.4902561604976654, "learning_rate": 3.448343079922027e-05, "loss": 0.1959, "step": 11940 }, { "epoch": 1.552956465237167, "grad_norm": 0.7716822028160095, "learning_rate": 3.4470435347628336e-05, "loss": 0.1924, "step": 11950 }, { "epoch": 1.5542560103963612, "grad_norm": 0.44589677453041077, "learning_rate": 3.4457439896036385e-05, "loss": 0.1291, "step": 11960 }, { "epoch": 1.5555555555555556, "grad_norm": 0.3775405287742615, "learning_rate": 3.444444444444445e-05, "loss": 0.1435, "step": 11970 }, { "epoch": 1.55685510071475, "grad_norm": 0.7642008066177368, "learning_rate": 3.44314489928525e-05, "loss": 0.2653, "step": 11980 }, { "epoch": 1.5581546458739441, "grad_norm": 2.492141008377075, "learning_rate": 3.441845354126056e-05, "loss": 0.2245, "step": 11990 }, { "epoch": 1.5594541910331383, "grad_norm": 0.4467933773994446, "learning_rate": 3.440545808966862e-05, "loss": 0.2161, "step": 12000 }, { "epoch": 1.5607537361923327, "grad_norm": 0.7821152806282043, "learning_rate": 3.439246263807667e-05, "loss": 0.1696, "step": 12010 }, { "epoch": 1.562053281351527, "grad_norm": 0.24658717215061188, "learning_rate": 3.4379467186484734e-05, "loss": 0.1319, "step": 12020 }, { "epoch": 1.5633528265107213, "grad_norm": 1.0287492275238037, "learning_rate": 3.436647173489279e-05, "loss": 0.2062, "step": 12030 }, { "epoch": 1.5646523716699154, "grad_norm": 0.6793880462646484, "learning_rate": 3.4353476283300845e-05, "loss": 0.1546, "step": 12040 }, { "epoch": 1.5659519168291098, "grad_norm": 0.8436505794525146, "learning_rate": 3.43404808317089e-05, "loss": 0.1528, "step": 12050 }, { "epoch": 1.5672514619883042, "grad_norm": 0.544039249420166, "learning_rate": 3.432748538011696e-05, "loss": 0.1628, "step": 12060 }, { "epoch": 1.5685510071474984, "grad_norm": 0.1758148968219757, "learning_rate": 3.431448992852502e-05, "loss": 0.1551, "step": 12070 }, { "epoch": 1.5698505523066926, "grad_norm": 0.4410891532897949, "learning_rate": 3.4301494476933075e-05, "loss": 0.1531, "step": 12080 }, { "epoch": 1.571150097465887, "grad_norm": 0.36337634921073914, "learning_rate": 3.428849902534113e-05, "loss": 0.145, "step": 12090 }, { "epoch": 1.5724496426250814, "grad_norm": 0.9623821377754211, "learning_rate": 3.427550357374919e-05, "loss": 0.196, "step": 12100 }, { "epoch": 1.5737491877842755, "grad_norm": 0.5426864624023438, "learning_rate": 3.426250812215724e-05, "loss": 0.2246, "step": 12110 }, { "epoch": 1.5750487329434697, "grad_norm": 0.5581771731376648, "learning_rate": 3.4249512670565306e-05, "loss": 0.1271, "step": 12120 }, { "epoch": 1.576348278102664, "grad_norm": 0.11381722241640091, "learning_rate": 3.423651721897336e-05, "loss": 0.1084, "step": 12130 }, { "epoch": 1.5776478232618585, "grad_norm": 0.2362438291311264, "learning_rate": 3.422352176738142e-05, "loss": 0.1478, "step": 12140 }, { "epoch": 1.5789473684210527, "grad_norm": 0.3028627038002014, "learning_rate": 3.421052631578947e-05, "loss": 0.1298, "step": 12150 }, { "epoch": 1.5802469135802468, "grad_norm": 0.413841187953949, "learning_rate": 3.419753086419753e-05, "loss": 0.1099, "step": 12160 }, { "epoch": 1.5815464587394412, "grad_norm": 0.7470784783363342, "learning_rate": 3.418453541260559e-05, "loss": 0.1948, "step": 12170 }, { "epoch": 1.5828460038986356, "grad_norm": 0.9139882326126099, "learning_rate": 3.417153996101365e-05, "loss": 0.2148, "step": 12180 }, { "epoch": 1.5841455490578298, "grad_norm": 0.29564911127090454, "learning_rate": 3.41585445094217e-05, "loss": 0.1189, "step": 12190 }, { "epoch": 1.585445094217024, "grad_norm": 0.7872270345687866, "learning_rate": 3.414554905782976e-05, "loss": 0.1927, "step": 12200 }, { "epoch": 1.5867446393762183, "grad_norm": 0.7470902800559998, "learning_rate": 3.4132553606237815e-05, "loss": 0.2301, "step": 12210 }, { "epoch": 1.5880441845354127, "grad_norm": 0.3433152735233307, "learning_rate": 3.411955815464588e-05, "loss": 0.1911, "step": 12220 }, { "epoch": 1.589343729694607, "grad_norm": 0.5126696228981018, "learning_rate": 3.4106562703053934e-05, "loss": 0.1967, "step": 12230 }, { "epoch": 1.590643274853801, "grad_norm": 0.42439547181129456, "learning_rate": 3.409356725146199e-05, "loss": 0.1787, "step": 12240 }, { "epoch": 1.5919428200129955, "grad_norm": 0.3991709053516388, "learning_rate": 3.4080571799870045e-05, "loss": 0.1379, "step": 12250 }, { "epoch": 1.5932423651721899, "grad_norm": 0.5843124389648438, "learning_rate": 3.40675763482781e-05, "loss": 0.2363, "step": 12260 }, { "epoch": 1.594541910331384, "grad_norm": 0.535999059677124, "learning_rate": 3.4054580896686164e-05, "loss": 0.147, "step": 12270 }, { "epoch": 1.5958414554905782, "grad_norm": 0.5784700512886047, "learning_rate": 3.404158544509422e-05, "loss": 0.1844, "step": 12280 }, { "epoch": 1.5971410006497726, "grad_norm": 0.3100731670856476, "learning_rate": 3.4028589993502275e-05, "loss": 0.1337, "step": 12290 }, { "epoch": 1.598440545808967, "grad_norm": 0.23276396095752716, "learning_rate": 3.401559454191033e-05, "loss": 0.1554, "step": 12300 }, { "epoch": 1.5997400909681612, "grad_norm": 0.2279946357011795, "learning_rate": 3.400259909031839e-05, "loss": 0.1235, "step": 12310 }, { "epoch": 1.6010396361273553, "grad_norm": 0.4522683620452881, "learning_rate": 3.398960363872645e-05, "loss": 0.1466, "step": 12320 }, { "epoch": 1.6023391812865497, "grad_norm": 0.5488743185997009, "learning_rate": 3.3976608187134506e-05, "loss": 0.1643, "step": 12330 }, { "epoch": 1.6036387264457441, "grad_norm": 0.4346708059310913, "learning_rate": 3.396361273554256e-05, "loss": 0.2317, "step": 12340 }, { "epoch": 1.6049382716049383, "grad_norm": 0.2432001680135727, "learning_rate": 3.395061728395062e-05, "loss": 0.1914, "step": 12350 }, { "epoch": 1.6062378167641325, "grad_norm": 0.8149144053459167, "learning_rate": 3.393762183235867e-05, "loss": 0.152, "step": 12360 }, { "epoch": 1.6075373619233269, "grad_norm": 0.3214959502220154, "learning_rate": 3.3924626380766736e-05, "loss": 0.1851, "step": 12370 }, { "epoch": 1.6088369070825213, "grad_norm": 0.37735676765441895, "learning_rate": 3.391163092917479e-05, "loss": 0.18, "step": 12380 }, { "epoch": 1.6101364522417154, "grad_norm": 0.589512288570404, "learning_rate": 3.389863547758285e-05, "loss": 0.1771, "step": 12390 }, { "epoch": 1.6114359974009096, "grad_norm": 0.33603790402412415, "learning_rate": 3.3885640025990903e-05, "loss": 0.3232, "step": 12400 }, { "epoch": 1.612735542560104, "grad_norm": 0.3729575574398041, "learning_rate": 3.387264457439896e-05, "loss": 0.1133, "step": 12410 }, { "epoch": 1.6140350877192984, "grad_norm": 0.7995907664299011, "learning_rate": 3.385964912280702e-05, "loss": 0.1329, "step": 12420 }, { "epoch": 1.6153346328784925, "grad_norm": 0.457811176776886, "learning_rate": 3.384665367121508e-05, "loss": 0.1882, "step": 12430 }, { "epoch": 1.6166341780376867, "grad_norm": 0.440924733877182, "learning_rate": 3.3833658219623134e-05, "loss": 0.1579, "step": 12440 }, { "epoch": 1.6179337231968811, "grad_norm": 0.6113675832748413, "learning_rate": 3.382066276803119e-05, "loss": 0.218, "step": 12450 }, { "epoch": 1.6192332683560755, "grad_norm": 0.3294018507003784, "learning_rate": 3.3807667316439245e-05, "loss": 0.1137, "step": 12460 }, { "epoch": 1.6205328135152697, "grad_norm": 0.4335266351699829, "learning_rate": 3.379467186484731e-05, "loss": 0.2091, "step": 12470 }, { "epoch": 1.6218323586744638, "grad_norm": 0.6185824275016785, "learning_rate": 3.3781676413255364e-05, "loss": 0.1504, "step": 12480 }, { "epoch": 1.6231319038336582, "grad_norm": 0.1851561814546585, "learning_rate": 3.376868096166342e-05, "loss": 0.2221, "step": 12490 }, { "epoch": 1.6244314489928526, "grad_norm": 0.13491426408290863, "learning_rate": 3.3755685510071476e-05, "loss": 0.1772, "step": 12500 }, { "epoch": 1.6257309941520468, "grad_norm": 0.3857628405094147, "learning_rate": 3.374269005847953e-05, "loss": 0.1229, "step": 12510 }, { "epoch": 1.627030539311241, "grad_norm": 0.7678126096725464, "learning_rate": 3.3729694606887594e-05, "loss": 0.1902, "step": 12520 }, { "epoch": 1.6283300844704354, "grad_norm": 0.5558577179908752, "learning_rate": 3.371669915529565e-05, "loss": 0.281, "step": 12530 }, { "epoch": 1.6296296296296298, "grad_norm": 0.4347871243953705, "learning_rate": 3.3703703703703706e-05, "loss": 0.1489, "step": 12540 }, { "epoch": 1.630929174788824, "grad_norm": 0.9946538209915161, "learning_rate": 3.369070825211176e-05, "loss": 0.2262, "step": 12550 }, { "epoch": 1.632228719948018, "grad_norm": 0.237952321767807, "learning_rate": 3.367771280051982e-05, "loss": 0.1608, "step": 12560 }, { "epoch": 1.6335282651072125, "grad_norm": 0.3400883674621582, "learning_rate": 3.366471734892787e-05, "loss": 0.2075, "step": 12570 }, { "epoch": 1.6348278102664069, "grad_norm": 0.5055714845657349, "learning_rate": 3.3651721897335936e-05, "loss": 0.1405, "step": 12580 }, { "epoch": 1.636127355425601, "grad_norm": 0.42873185873031616, "learning_rate": 3.363872644574399e-05, "loss": 0.1405, "step": 12590 }, { "epoch": 1.6374269005847952, "grad_norm": 0.22622540593147278, "learning_rate": 3.362573099415205e-05, "loss": 0.1805, "step": 12600 }, { "epoch": 1.6387264457439896, "grad_norm": 0.6129465103149414, "learning_rate": 3.3612735542560104e-05, "loss": 0.1825, "step": 12610 }, { "epoch": 1.640025990903184, "grad_norm": 0.20284713804721832, "learning_rate": 3.359974009096816e-05, "loss": 0.1758, "step": 12620 }, { "epoch": 1.6413255360623782, "grad_norm": 0.6586829423904419, "learning_rate": 3.358674463937622e-05, "loss": 0.1669, "step": 12630 }, { "epoch": 1.6426250812215724, "grad_norm": 0.3911861181259155, "learning_rate": 3.357374918778428e-05, "loss": 0.1446, "step": 12640 }, { "epoch": 1.6439246263807668, "grad_norm": 0.29705753922462463, "learning_rate": 3.3560753736192334e-05, "loss": 0.1147, "step": 12650 }, { "epoch": 1.6452241715399611, "grad_norm": 0.1912851631641388, "learning_rate": 3.354775828460039e-05, "loss": 0.1829, "step": 12660 }, { "epoch": 1.6465237166991553, "grad_norm": 0.5710114240646362, "learning_rate": 3.3534762833008445e-05, "loss": 0.2711, "step": 12670 }, { "epoch": 1.6478232618583495, "grad_norm": 0.38864246010780334, "learning_rate": 3.352176738141651e-05, "loss": 0.1947, "step": 12680 }, { "epoch": 1.6491228070175439, "grad_norm": 0.3914055824279785, "learning_rate": 3.3508771929824564e-05, "loss": 0.1445, "step": 12690 }, { "epoch": 1.6504223521767383, "grad_norm": 0.4254039525985718, "learning_rate": 3.349577647823262e-05, "loss": 0.1766, "step": 12700 }, { "epoch": 1.6517218973359324, "grad_norm": 0.722698450088501, "learning_rate": 3.3482781026640676e-05, "loss": 0.242, "step": 12710 }, { "epoch": 1.6530214424951266, "grad_norm": 0.7755239605903625, "learning_rate": 3.346978557504873e-05, "loss": 0.1779, "step": 12720 }, { "epoch": 1.654320987654321, "grad_norm": 0.5166723132133484, "learning_rate": 3.3456790123456794e-05, "loss": 0.1741, "step": 12730 }, { "epoch": 1.6556205328135154, "grad_norm": 0.38875946402549744, "learning_rate": 3.344379467186485e-05, "loss": 0.1289, "step": 12740 }, { "epoch": 1.6569200779727096, "grad_norm": 0.7727816104888916, "learning_rate": 3.3430799220272906e-05, "loss": 0.1458, "step": 12750 }, { "epoch": 1.6582196231319037, "grad_norm": 0.672166645526886, "learning_rate": 3.341780376868096e-05, "loss": 0.1661, "step": 12760 }, { "epoch": 1.6595191682910981, "grad_norm": 0.8514589071273804, "learning_rate": 3.340480831708902e-05, "loss": 0.2098, "step": 12770 }, { "epoch": 1.6608187134502925, "grad_norm": 0.26438242197036743, "learning_rate": 3.339181286549708e-05, "loss": 0.2076, "step": 12780 }, { "epoch": 1.6621182586094867, "grad_norm": 0.31734591722488403, "learning_rate": 3.3378817413905136e-05, "loss": 0.1879, "step": 12790 }, { "epoch": 1.6634178037686809, "grad_norm": 0.754230797290802, "learning_rate": 3.336582196231319e-05, "loss": 0.2309, "step": 12800 }, { "epoch": 1.6647173489278753, "grad_norm": 0.24648337066173553, "learning_rate": 3.335282651072125e-05, "loss": 0.194, "step": 12810 }, { "epoch": 1.6660168940870697, "grad_norm": 0.698929488658905, "learning_rate": 3.3339831059129304e-05, "loss": 0.1869, "step": 12820 }, { "epoch": 1.6673164392462638, "grad_norm": 0.15393215417861938, "learning_rate": 3.3326835607537366e-05, "loss": 0.1339, "step": 12830 }, { "epoch": 1.668615984405458, "grad_norm": 0.25954535603523254, "learning_rate": 3.331384015594542e-05, "loss": 0.1208, "step": 12840 }, { "epoch": 1.6699155295646524, "grad_norm": 0.7581526637077332, "learning_rate": 3.330084470435348e-05, "loss": 0.1881, "step": 12850 }, { "epoch": 1.6712150747238468, "grad_norm": 0.9112092852592468, "learning_rate": 3.3287849252761534e-05, "loss": 0.2091, "step": 12860 }, { "epoch": 1.672514619883041, "grad_norm": 0.3279200494289398, "learning_rate": 3.327485380116959e-05, "loss": 0.2248, "step": 12870 }, { "epoch": 1.6738141650422351, "grad_norm": 0.5870591402053833, "learning_rate": 3.326185834957765e-05, "loss": 0.1851, "step": 12880 }, { "epoch": 1.6751137102014295, "grad_norm": 1.639194369316101, "learning_rate": 3.324886289798571e-05, "loss": 0.2684, "step": 12890 }, { "epoch": 1.676413255360624, "grad_norm": 0.5718706846237183, "learning_rate": 3.3235867446393764e-05, "loss": 0.1794, "step": 12900 }, { "epoch": 1.677712800519818, "grad_norm": 0.5392174124717712, "learning_rate": 3.322287199480182e-05, "loss": 0.1869, "step": 12910 }, { "epoch": 1.6790123456790123, "grad_norm": 0.8114726543426514, "learning_rate": 3.3209876543209876e-05, "loss": 0.2139, "step": 12920 }, { "epoch": 1.6803118908382066, "grad_norm": 0.8223268985748291, "learning_rate": 3.319688109161794e-05, "loss": 0.1615, "step": 12930 }, { "epoch": 1.681611435997401, "grad_norm": 0.6799668073654175, "learning_rate": 3.3183885640025994e-05, "loss": 0.1819, "step": 12940 }, { "epoch": 1.6829109811565952, "grad_norm": 0.2960383892059326, "learning_rate": 3.317089018843405e-05, "loss": 0.3184, "step": 12950 }, { "epoch": 1.6842105263157894, "grad_norm": 0.39749807119369507, "learning_rate": 3.3157894736842106e-05, "loss": 0.1287, "step": 12960 }, { "epoch": 1.6855100714749838, "grad_norm": 0.4407481551170349, "learning_rate": 3.314489928525016e-05, "loss": 0.1699, "step": 12970 }, { "epoch": 1.6868096166341782, "grad_norm": 0.34588822722435, "learning_rate": 3.3131903833658224e-05, "loss": 0.1199, "step": 12980 }, { "epoch": 1.6881091617933723, "grad_norm": 0.4418480396270752, "learning_rate": 3.311890838206628e-05, "loss": 0.1265, "step": 12990 }, { "epoch": 1.6894087069525665, "grad_norm": 0.5013694763183594, "learning_rate": 3.3105912930474336e-05, "loss": 0.1441, "step": 13000 }, { "epoch": 1.690708252111761, "grad_norm": 0.7744922637939453, "learning_rate": 3.309291747888239e-05, "loss": 0.1657, "step": 13010 }, { "epoch": 1.6920077972709553, "grad_norm": 0.4666323661804199, "learning_rate": 3.307992202729045e-05, "loss": 0.1537, "step": 13020 }, { "epoch": 1.6933073424301495, "grad_norm": 0.6727086305618286, "learning_rate": 3.306692657569851e-05, "loss": 0.153, "step": 13030 }, { "epoch": 1.6946068875893436, "grad_norm": 0.22604085505008698, "learning_rate": 3.3053931124106566e-05, "loss": 0.1848, "step": 13040 }, { "epoch": 1.695906432748538, "grad_norm": 0.37330031394958496, "learning_rate": 3.304093567251462e-05, "loss": 0.1864, "step": 13050 }, { "epoch": 1.6972059779077324, "grad_norm": 0.3415911793708801, "learning_rate": 3.302794022092268e-05, "loss": 0.177, "step": 13060 }, { "epoch": 1.6985055230669266, "grad_norm": 0.5671162009239197, "learning_rate": 3.3014944769330734e-05, "loss": 0.1827, "step": 13070 }, { "epoch": 1.6998050682261208, "grad_norm": 0.8758267760276794, "learning_rate": 3.3001949317738797e-05, "loss": 0.1721, "step": 13080 }, { "epoch": 1.7011046133853152, "grad_norm": 0.6692143082618713, "learning_rate": 3.2988953866146846e-05, "loss": 0.1941, "step": 13090 }, { "epoch": 1.7024041585445095, "grad_norm": 0.47053369879722595, "learning_rate": 3.297595841455491e-05, "loss": 0.1942, "step": 13100 }, { "epoch": 1.7037037037037037, "grad_norm": 0.5366320610046387, "learning_rate": 3.2962962962962964e-05, "loss": 0.1327, "step": 13110 }, { "epoch": 1.705003248862898, "grad_norm": 0.7886143922805786, "learning_rate": 3.294996751137102e-05, "loss": 0.2228, "step": 13120 }, { "epoch": 1.7063027940220923, "grad_norm": 0.30773016810417175, "learning_rate": 3.293697205977908e-05, "loss": 0.1326, "step": 13130 }, { "epoch": 1.7076023391812867, "grad_norm": 0.42208778858184814, "learning_rate": 3.292397660818713e-05, "loss": 0.1733, "step": 13140 }, { "epoch": 1.7089018843404808, "grad_norm": 0.46094638109207153, "learning_rate": 3.2910981156595194e-05, "loss": 0.1839, "step": 13150 }, { "epoch": 1.710201429499675, "grad_norm": 0.30078810453414917, "learning_rate": 3.289798570500325e-05, "loss": 0.1703, "step": 13160 }, { "epoch": 1.7115009746588694, "grad_norm": 0.38936856389045715, "learning_rate": 3.2884990253411306e-05, "loss": 0.1575, "step": 13170 }, { "epoch": 1.7128005198180638, "grad_norm": 0.5769753456115723, "learning_rate": 3.287199480181937e-05, "loss": 0.1485, "step": 13180 }, { "epoch": 1.714100064977258, "grad_norm": 0.6210539937019348, "learning_rate": 3.285899935022742e-05, "loss": 0.1884, "step": 13190 }, { "epoch": 1.7153996101364521, "grad_norm": 0.3266933560371399, "learning_rate": 3.284600389863548e-05, "loss": 0.2008, "step": 13200 }, { "epoch": 1.7166991552956465, "grad_norm": 0.9744528532028198, "learning_rate": 3.2833008447043536e-05, "loss": 0.2345, "step": 13210 }, { "epoch": 1.717998700454841, "grad_norm": 1.168868899345398, "learning_rate": 3.282001299545159e-05, "loss": 0.2219, "step": 13220 }, { "epoch": 1.719298245614035, "grad_norm": 0.3040392994880676, "learning_rate": 3.2807017543859655e-05, "loss": 0.1535, "step": 13230 }, { "epoch": 1.7205977907732293, "grad_norm": 0.2786175608634949, "learning_rate": 3.2794022092267704e-05, "loss": 0.1593, "step": 13240 }, { "epoch": 1.7218973359324237, "grad_norm": 0.15722952783107758, "learning_rate": 3.2781026640675766e-05, "loss": 0.1055, "step": 13250 }, { "epoch": 1.723196881091618, "grad_norm": 0.45786747336387634, "learning_rate": 3.276803118908382e-05, "loss": 0.2043, "step": 13260 }, { "epoch": 1.7244964262508122, "grad_norm": 0.31638896465301514, "learning_rate": 3.275503573749188e-05, "loss": 0.1714, "step": 13270 }, { "epoch": 1.7257959714100064, "grad_norm": 0.3338547348976135, "learning_rate": 3.274204028589994e-05, "loss": 0.2712, "step": 13280 }, { "epoch": 1.7270955165692008, "grad_norm": 0.41380077600479126, "learning_rate": 3.272904483430799e-05, "loss": 0.1744, "step": 13290 }, { "epoch": 1.7283950617283952, "grad_norm": 0.27394720911979675, "learning_rate": 3.271604938271605e-05, "loss": 0.1665, "step": 13300 }, { "epoch": 1.7296946068875894, "grad_norm": 0.7841046452522278, "learning_rate": 3.270305393112411e-05, "loss": 0.1658, "step": 13310 }, { "epoch": 1.7309941520467835, "grad_norm": 0.1784866601228714, "learning_rate": 3.2690058479532164e-05, "loss": 0.1948, "step": 13320 }, { "epoch": 1.732293697205978, "grad_norm": 0.4508218467235565, "learning_rate": 3.267706302794023e-05, "loss": 0.1223, "step": 13330 }, { "epoch": 1.7335932423651723, "grad_norm": 0.4451533854007721, "learning_rate": 3.2664067576348276e-05, "loss": 0.1586, "step": 13340 }, { "epoch": 1.7348927875243665, "grad_norm": 1.090923547744751, "learning_rate": 3.265107212475634e-05, "loss": 0.1371, "step": 13350 }, { "epoch": 1.7361923326835607, "grad_norm": 0.7383362054824829, "learning_rate": 3.2638076673164394e-05, "loss": 0.2027, "step": 13360 }, { "epoch": 1.737491877842755, "grad_norm": 1.3181321620941162, "learning_rate": 3.262508122157245e-05, "loss": 0.2946, "step": 13370 }, { "epoch": 1.7387914230019494, "grad_norm": 0.8942499756813049, "learning_rate": 3.261208576998051e-05, "loss": 0.1465, "step": 13380 }, { "epoch": 1.7400909681611436, "grad_norm": 0.555034875869751, "learning_rate": 3.259909031838856e-05, "loss": 0.2367, "step": 13390 }, { "epoch": 1.7413905133203378, "grad_norm": 0.9286365509033203, "learning_rate": 3.2586094866796625e-05, "loss": 0.2003, "step": 13400 }, { "epoch": 1.7426900584795322, "grad_norm": 0.42594191431999207, "learning_rate": 3.257309941520468e-05, "loss": 0.214, "step": 13410 }, { "epoch": 1.7439896036387266, "grad_norm": 0.6483904123306274, "learning_rate": 3.2560103963612736e-05, "loss": 0.1695, "step": 13420 }, { "epoch": 1.7452891487979207, "grad_norm": 0.1384648084640503, "learning_rate": 3.25471085120208e-05, "loss": 0.1692, "step": 13430 }, { "epoch": 1.746588693957115, "grad_norm": 1.2361787557601929, "learning_rate": 3.253411306042885e-05, "loss": 0.2048, "step": 13440 }, { "epoch": 1.7478882391163093, "grad_norm": 0.38589033484458923, "learning_rate": 3.252111760883691e-05, "loss": 0.1201, "step": 13450 }, { "epoch": 1.7491877842755037, "grad_norm": 0.529181718826294, "learning_rate": 3.2508122157244966e-05, "loss": 0.157, "step": 13460 }, { "epoch": 1.7504873294346979, "grad_norm": 0.449504554271698, "learning_rate": 3.249512670565302e-05, "loss": 0.1456, "step": 13470 }, { "epoch": 1.751786874593892, "grad_norm": 0.1310100555419922, "learning_rate": 3.2482131254061085e-05, "loss": 0.1256, "step": 13480 }, { "epoch": 1.7530864197530864, "grad_norm": 0.21035268902778625, "learning_rate": 3.2469135802469134e-05, "loss": 0.1308, "step": 13490 }, { "epoch": 1.7543859649122808, "grad_norm": 0.4227098822593689, "learning_rate": 3.24561403508772e-05, "loss": 0.2231, "step": 13500 }, { "epoch": 1.755685510071475, "grad_norm": 0.42051783204078674, "learning_rate": 3.244314489928525e-05, "loss": 0.1376, "step": 13510 }, { "epoch": 1.7569850552306692, "grad_norm": 0.4949670433998108, "learning_rate": 3.243014944769331e-05, "loss": 0.1709, "step": 13520 }, { "epoch": 1.7582846003898636, "grad_norm": 0.40912383794784546, "learning_rate": 3.241715399610137e-05, "loss": 0.13, "step": 13530 }, { "epoch": 1.759584145549058, "grad_norm": 0.2635163962841034, "learning_rate": 3.240415854450942e-05, "loss": 0.2018, "step": 13540 }, { "epoch": 1.7608836907082521, "grad_norm": 0.194112166762352, "learning_rate": 3.239116309291748e-05, "loss": 0.1341, "step": 13550 }, { "epoch": 1.7621832358674463, "grad_norm": 0.4350893497467041, "learning_rate": 3.237816764132554e-05, "loss": 0.1384, "step": 13560 }, { "epoch": 1.7634827810266407, "grad_norm": 0.5928441286087036, "learning_rate": 3.2365172189733594e-05, "loss": 0.1624, "step": 13570 }, { "epoch": 1.764782326185835, "grad_norm": 0.7090528607368469, "learning_rate": 3.235217673814166e-05, "loss": 0.1069, "step": 13580 }, { "epoch": 1.7660818713450293, "grad_norm": 0.5013406872749329, "learning_rate": 3.2339181286549706e-05, "loss": 0.1115, "step": 13590 }, { "epoch": 1.7673814165042234, "grad_norm": 0.5878395438194275, "learning_rate": 3.232618583495777e-05, "loss": 0.1467, "step": 13600 }, { "epoch": 1.7686809616634178, "grad_norm": 0.4357517659664154, "learning_rate": 3.231319038336582e-05, "loss": 0.1352, "step": 13610 }, { "epoch": 1.7699805068226122, "grad_norm": 0.3509741723537445, "learning_rate": 3.230019493177388e-05, "loss": 0.1808, "step": 13620 }, { "epoch": 1.7712800519818064, "grad_norm": 0.7217328548431396, "learning_rate": 3.228719948018194e-05, "loss": 0.2113, "step": 13630 }, { "epoch": 1.7725795971410006, "grad_norm": 0.3128218948841095, "learning_rate": 3.227420402858999e-05, "loss": 0.1391, "step": 13640 }, { "epoch": 1.773879142300195, "grad_norm": 0.5456250905990601, "learning_rate": 3.2261208576998055e-05, "loss": 0.2132, "step": 13650 }, { "epoch": 1.7751786874593893, "grad_norm": 0.44247162342071533, "learning_rate": 3.2248213125406104e-05, "loss": 0.1709, "step": 13660 }, { "epoch": 1.7764782326185835, "grad_norm": 0.702402651309967, "learning_rate": 3.2235217673814167e-05, "loss": 0.1597, "step": 13670 }, { "epoch": 1.7777777777777777, "grad_norm": 0.5839279294013977, "learning_rate": 3.222222222222223e-05, "loss": 0.1424, "step": 13680 }, { "epoch": 1.779077322936972, "grad_norm": 0.8189912438392639, "learning_rate": 3.220922677063028e-05, "loss": 0.1379, "step": 13690 }, { "epoch": 1.7803768680961665, "grad_norm": 0.23380646109580994, "learning_rate": 3.219623131903834e-05, "loss": 0.1973, "step": 13700 }, { "epoch": 1.7816764132553606, "grad_norm": 0.6010541915893555, "learning_rate": 3.218323586744639e-05, "loss": 0.2403, "step": 13710 }, { "epoch": 1.7829759584145548, "grad_norm": 0.7049251198768616, "learning_rate": 3.217024041585445e-05, "loss": 0.1773, "step": 13720 }, { "epoch": 1.7842755035737492, "grad_norm": 0.3161669075489044, "learning_rate": 3.215724496426251e-05, "loss": 0.1583, "step": 13730 }, { "epoch": 1.7855750487329436, "grad_norm": 0.26356351375579834, "learning_rate": 3.2144249512670564e-05, "loss": 0.1538, "step": 13740 }, { "epoch": 1.7868745938921378, "grad_norm": 0.32545047998428345, "learning_rate": 3.213125406107863e-05, "loss": 0.1675, "step": 13750 }, { "epoch": 1.788174139051332, "grad_norm": 0.9317497611045837, "learning_rate": 3.2118258609486676e-05, "loss": 0.2152, "step": 13760 }, { "epoch": 1.7894736842105263, "grad_norm": 0.24868439137935638, "learning_rate": 3.210526315789474e-05, "loss": 0.2034, "step": 13770 }, { "epoch": 1.7907732293697207, "grad_norm": 0.5433592796325684, "learning_rate": 3.2092267706302795e-05, "loss": 0.227, "step": 13780 }, { "epoch": 1.792072774528915, "grad_norm": 0.26896002888679504, "learning_rate": 3.207927225471085e-05, "loss": 0.1638, "step": 13790 }, { "epoch": 1.793372319688109, "grad_norm": 0.5252446532249451, "learning_rate": 3.206627680311891e-05, "loss": 0.1612, "step": 13800 }, { "epoch": 1.7946718648473035, "grad_norm": 0.21503140032291412, "learning_rate": 3.205328135152696e-05, "loss": 0.1548, "step": 13810 }, { "epoch": 1.7959714100064978, "grad_norm": 0.44032639265060425, "learning_rate": 3.2040285899935025e-05, "loss": 0.1917, "step": 13820 }, { "epoch": 1.797270955165692, "grad_norm": 0.13984175026416779, "learning_rate": 3.202729044834308e-05, "loss": 0.1391, "step": 13830 }, { "epoch": 1.7985705003248862, "grad_norm": 0.20680394768714905, "learning_rate": 3.2014294996751136e-05, "loss": 0.2843, "step": 13840 }, { "epoch": 1.7998700454840806, "grad_norm": 0.7254112958908081, "learning_rate": 3.20012995451592e-05, "loss": 0.2702, "step": 13850 }, { "epoch": 1.801169590643275, "grad_norm": 0.43737471103668213, "learning_rate": 3.198830409356725e-05, "loss": 0.2108, "step": 13860 }, { "epoch": 1.8024691358024691, "grad_norm": 0.7318388819694519, "learning_rate": 3.197530864197531e-05, "loss": 0.1457, "step": 13870 }, { "epoch": 1.8037686809616633, "grad_norm": 0.31557515263557434, "learning_rate": 3.196231319038337e-05, "loss": 0.2315, "step": 13880 }, { "epoch": 1.8050682261208577, "grad_norm": 0.4905223548412323, "learning_rate": 3.194931773879142e-05, "loss": 0.2188, "step": 13890 }, { "epoch": 1.806367771280052, "grad_norm": 0.6402523517608643, "learning_rate": 3.1936322287199485e-05, "loss": 0.1824, "step": 13900 }, { "epoch": 1.8076673164392463, "grad_norm": 0.4821411669254303, "learning_rate": 3.1923326835607534e-05, "loss": 0.1102, "step": 13910 }, { "epoch": 1.8089668615984404, "grad_norm": 0.21423040330410004, "learning_rate": 3.19103313840156e-05, "loss": 0.1434, "step": 13920 }, { "epoch": 1.8102664067576348, "grad_norm": 0.46749547123908997, "learning_rate": 3.189733593242365e-05, "loss": 0.2074, "step": 13930 }, { "epoch": 1.8115659519168292, "grad_norm": 0.5757670998573303, "learning_rate": 3.188434048083171e-05, "loss": 0.1291, "step": 13940 }, { "epoch": 1.8128654970760234, "grad_norm": 0.28579702973365784, "learning_rate": 3.187134502923977e-05, "loss": 0.1984, "step": 13950 }, { "epoch": 1.8141650422352176, "grad_norm": 0.5650652647018433, "learning_rate": 3.185834957764782e-05, "loss": 0.151, "step": 13960 }, { "epoch": 1.815464587394412, "grad_norm": 0.6889373660087585, "learning_rate": 3.184535412605588e-05, "loss": 0.1758, "step": 13970 }, { "epoch": 1.8167641325536064, "grad_norm": 0.8544249534606934, "learning_rate": 3.183235867446394e-05, "loss": 0.1691, "step": 13980 }, { "epoch": 1.8180636777128005, "grad_norm": 0.8953562378883362, "learning_rate": 3.1819363222871995e-05, "loss": 0.1641, "step": 13990 }, { "epoch": 1.8193632228719947, "grad_norm": 0.5575627684593201, "learning_rate": 3.180636777128006e-05, "loss": 0.1989, "step": 14000 }, { "epoch": 1.820662768031189, "grad_norm": 0.3218742907047272, "learning_rate": 3.1793372319688106e-05, "loss": 0.2155, "step": 14010 }, { "epoch": 1.8219623131903835, "grad_norm": 0.18076352775096893, "learning_rate": 3.178037686809617e-05, "loss": 0.1666, "step": 14020 }, { "epoch": 1.8232618583495777, "grad_norm": 0.5052632093429565, "learning_rate": 3.1767381416504225e-05, "loss": 0.1635, "step": 14030 }, { "epoch": 1.8245614035087718, "grad_norm": 0.3427061140537262, "learning_rate": 3.175438596491228e-05, "loss": 0.1929, "step": 14040 }, { "epoch": 1.8258609486679662, "grad_norm": 0.3823476731777191, "learning_rate": 3.174139051332034e-05, "loss": 0.1211, "step": 14050 }, { "epoch": 1.8271604938271606, "grad_norm": 0.3676934540271759, "learning_rate": 3.172839506172839e-05, "loss": 0.1452, "step": 14060 }, { "epoch": 1.8284600389863548, "grad_norm": 0.19330710172653198, "learning_rate": 3.1715399610136455e-05, "loss": 0.1182, "step": 14070 }, { "epoch": 1.829759584145549, "grad_norm": 0.5925936698913574, "learning_rate": 3.170240415854451e-05, "loss": 0.1892, "step": 14080 }, { "epoch": 1.8310591293047433, "grad_norm": 0.7980806827545166, "learning_rate": 3.168940870695257e-05, "loss": 0.1717, "step": 14090 }, { "epoch": 1.8323586744639377, "grad_norm": 0.40725287795066833, "learning_rate": 3.167641325536063e-05, "loss": 0.1286, "step": 14100 }, { "epoch": 1.833658219623132, "grad_norm": 0.35505935549736023, "learning_rate": 3.166341780376868e-05, "loss": 0.1278, "step": 14110 }, { "epoch": 1.834957764782326, "grad_norm": 0.43881702423095703, "learning_rate": 3.165042235217674e-05, "loss": 0.2052, "step": 14120 }, { "epoch": 1.8362573099415205, "grad_norm": 0.8321799039840698, "learning_rate": 3.16374269005848e-05, "loss": 0.1981, "step": 14130 }, { "epoch": 1.8375568551007149, "grad_norm": 0.85514897108078, "learning_rate": 3.162443144899285e-05, "loss": 0.2257, "step": 14140 }, { "epoch": 1.838856400259909, "grad_norm": 0.7052195072174072, "learning_rate": 3.1611435997400915e-05, "loss": 0.2141, "step": 14150 }, { "epoch": 1.8401559454191032, "grad_norm": 0.30936312675476074, "learning_rate": 3.1598440545808964e-05, "loss": 0.2593, "step": 14160 }, { "epoch": 1.8414554905782976, "grad_norm": 0.38299545645713806, "learning_rate": 3.158544509421703e-05, "loss": 0.1204, "step": 14170 }, { "epoch": 1.842755035737492, "grad_norm": 0.7088260054588318, "learning_rate": 3.157244964262508e-05, "loss": 0.153, "step": 14180 }, { "epoch": 1.8440545808966862, "grad_norm": 0.3385081887245178, "learning_rate": 3.155945419103314e-05, "loss": 0.1912, "step": 14190 }, { "epoch": 1.8453541260558803, "grad_norm": 0.2500070631504059, "learning_rate": 3.15464587394412e-05, "loss": 0.1398, "step": 14200 }, { "epoch": 1.8466536712150747, "grad_norm": 0.41618502140045166, "learning_rate": 3.153346328784925e-05, "loss": 0.1089, "step": 14210 }, { "epoch": 1.8479532163742691, "grad_norm": 0.6627466678619385, "learning_rate": 3.152046783625731e-05, "loss": 0.2121, "step": 14220 }, { "epoch": 1.8492527615334633, "grad_norm": 0.18631447851657867, "learning_rate": 3.150747238466537e-05, "loss": 0.0982, "step": 14230 }, { "epoch": 1.8505523066926575, "grad_norm": 1.089392066001892, "learning_rate": 3.1494476933073425e-05, "loss": 0.199, "step": 14240 }, { "epoch": 1.8518518518518519, "grad_norm": 0.5951752662658691, "learning_rate": 3.148148148148148e-05, "loss": 0.1864, "step": 14250 }, { "epoch": 1.8531513970110463, "grad_norm": 0.6356686949729919, "learning_rate": 3.1468486029889537e-05, "loss": 0.1761, "step": 14260 }, { "epoch": 1.8544509421702404, "grad_norm": 0.4095602035522461, "learning_rate": 3.14554905782976e-05, "loss": 0.1914, "step": 14270 }, { "epoch": 1.8557504873294346, "grad_norm": 0.8940890431404114, "learning_rate": 3.1442495126705655e-05, "loss": 0.144, "step": 14280 }, { "epoch": 1.857050032488629, "grad_norm": 0.44039613008499146, "learning_rate": 3.142949967511371e-05, "loss": 0.2411, "step": 14290 }, { "epoch": 1.8583495776478234, "grad_norm": 0.3076001703739166, "learning_rate": 3.141650422352177e-05, "loss": 0.1448, "step": 14300 }, { "epoch": 1.8596491228070176, "grad_norm": 2.0991063117980957, "learning_rate": 3.140350877192982e-05, "loss": 0.2581, "step": 14310 }, { "epoch": 1.8609486679662117, "grad_norm": 0.6965035796165466, "learning_rate": 3.1390513320337885e-05, "loss": 0.1388, "step": 14320 }, { "epoch": 1.8622482131254061, "grad_norm": 0.41939210891723633, "learning_rate": 3.137751786874594e-05, "loss": 0.1633, "step": 14330 }, { "epoch": 1.8635477582846005, "grad_norm": 0.37727847695350647, "learning_rate": 3.1364522417154e-05, "loss": 0.1348, "step": 14340 }, { "epoch": 1.8648473034437947, "grad_norm": 0.271004855632782, "learning_rate": 3.135152696556205e-05, "loss": 0.1502, "step": 14350 }, { "epoch": 1.8661468486029889, "grad_norm": 0.2245119959115982, "learning_rate": 3.133853151397011e-05, "loss": 0.1238, "step": 14360 }, { "epoch": 1.8674463937621832, "grad_norm": 0.2907504439353943, "learning_rate": 3.132553606237817e-05, "loss": 0.1315, "step": 14370 }, { "epoch": 1.8687459389213776, "grad_norm": 0.32418856024742126, "learning_rate": 3.131254061078623e-05, "loss": 0.1864, "step": 14380 }, { "epoch": 1.8700454840805718, "grad_norm": 0.5182332992553711, "learning_rate": 3.129954515919428e-05, "loss": 0.173, "step": 14390 }, { "epoch": 1.871345029239766, "grad_norm": 0.5861763954162598, "learning_rate": 3.128654970760234e-05, "loss": 0.1797, "step": 14400 }, { "epoch": 1.8726445743989604, "grad_norm": 0.42898136377334595, "learning_rate": 3.1273554256010395e-05, "loss": 0.181, "step": 14410 }, { "epoch": 1.8739441195581548, "grad_norm": 0.30955934524536133, "learning_rate": 3.126055880441846e-05, "loss": 0.2853, "step": 14420 }, { "epoch": 1.875243664717349, "grad_norm": 0.2213142067193985, "learning_rate": 3.124756335282651e-05, "loss": 0.1671, "step": 14430 }, { "epoch": 1.876543209876543, "grad_norm": 0.26573047041893005, "learning_rate": 3.123456790123457e-05, "loss": 0.1642, "step": 14440 }, { "epoch": 1.8778427550357375, "grad_norm": 0.4463496208190918, "learning_rate": 3.1221572449642625e-05, "loss": 0.1735, "step": 14450 }, { "epoch": 1.879142300194932, "grad_norm": 0.49962425231933594, "learning_rate": 3.120857699805068e-05, "loss": 0.2003, "step": 14460 }, { "epoch": 1.880441845354126, "grad_norm": 0.7352392077445984, "learning_rate": 3.1195581546458743e-05, "loss": 0.1911, "step": 14470 }, { "epoch": 1.8817413905133202, "grad_norm": 0.44394344091415405, "learning_rate": 3.11825860948668e-05, "loss": 0.1618, "step": 14480 }, { "epoch": 1.8830409356725146, "grad_norm": 0.669462263584137, "learning_rate": 3.1169590643274855e-05, "loss": 0.1538, "step": 14490 }, { "epoch": 1.884340480831709, "grad_norm": 0.8166868686676025, "learning_rate": 3.115659519168291e-05, "loss": 0.2345, "step": 14500 }, { "epoch": 1.8856400259909032, "grad_norm": 0.3486957550048828, "learning_rate": 3.114359974009097e-05, "loss": 0.1639, "step": 14510 }, { "epoch": 1.8869395711500974, "grad_norm": 0.5073180794715881, "learning_rate": 3.113060428849903e-05, "loss": 0.1945, "step": 14520 }, { "epoch": 1.8882391163092918, "grad_norm": 0.36370372772216797, "learning_rate": 3.1117608836907085e-05, "loss": 0.1297, "step": 14530 }, { "epoch": 1.8895386614684861, "grad_norm": 0.42234060168266296, "learning_rate": 3.110461338531514e-05, "loss": 0.1419, "step": 14540 }, { "epoch": 1.8908382066276803, "grad_norm": 0.418069064617157, "learning_rate": 3.10916179337232e-05, "loss": 0.1334, "step": 14550 }, { "epoch": 1.8921377517868745, "grad_norm": 0.3264067769050598, "learning_rate": 3.107862248213125e-05, "loss": 0.1555, "step": 14560 }, { "epoch": 1.8934372969460689, "grad_norm": 0.8600042462348938, "learning_rate": 3.1065627030539316e-05, "loss": 0.191, "step": 14570 }, { "epoch": 1.8947368421052633, "grad_norm": 0.14986300468444824, "learning_rate": 3.105263157894737e-05, "loss": 0.1522, "step": 14580 }, { "epoch": 1.8960363872644574, "grad_norm": 0.22737647593021393, "learning_rate": 3.103963612735543e-05, "loss": 0.1602, "step": 14590 }, { "epoch": 1.8973359324236516, "grad_norm": 0.4719073176383972, "learning_rate": 3.102664067576348e-05, "loss": 0.1782, "step": 14600 }, { "epoch": 1.898635477582846, "grad_norm": 0.19554801285266876, "learning_rate": 3.101364522417154e-05, "loss": 0.1071, "step": 14610 }, { "epoch": 1.8999350227420404, "grad_norm": 0.26606082916259766, "learning_rate": 3.10006497725796e-05, "loss": 0.1633, "step": 14620 }, { "epoch": 1.9012345679012346, "grad_norm": 0.4376918375492096, "learning_rate": 3.098765432098766e-05, "loss": 0.1827, "step": 14630 }, { "epoch": 1.9025341130604287, "grad_norm": 0.20063336193561554, "learning_rate": 3.097465886939571e-05, "loss": 0.1494, "step": 14640 }, { "epoch": 1.9038336582196231, "grad_norm": 0.6916656494140625, "learning_rate": 3.096166341780377e-05, "loss": 0.1667, "step": 14650 }, { "epoch": 1.9051332033788175, "grad_norm": 0.7167173624038696, "learning_rate": 3.0948667966211825e-05, "loss": 0.1263, "step": 14660 }, { "epoch": 1.9064327485380117, "grad_norm": 0.9046335816383362, "learning_rate": 3.093567251461989e-05, "loss": 0.1991, "step": 14670 }, { "epoch": 1.9077322936972059, "grad_norm": 0.28706061840057373, "learning_rate": 3.0922677063027944e-05, "loss": 0.2088, "step": 14680 }, { "epoch": 1.9090318388564003, "grad_norm": 0.4261971712112427, "learning_rate": 3.0909681611436e-05, "loss": 0.1713, "step": 14690 }, { "epoch": 1.9103313840155947, "grad_norm": 0.383404940366745, "learning_rate": 3.0896686159844055e-05, "loss": 0.2081, "step": 14700 }, { "epoch": 1.9116309291747888, "grad_norm": 0.3268422484397888, "learning_rate": 3.088369070825211e-05, "loss": 0.1727, "step": 14710 }, { "epoch": 1.912930474333983, "grad_norm": 0.29814964532852173, "learning_rate": 3.0870695256660174e-05, "loss": 0.1386, "step": 14720 }, { "epoch": 1.9142300194931774, "grad_norm": 0.37566468119621277, "learning_rate": 3.085769980506823e-05, "loss": 0.1227, "step": 14730 }, { "epoch": 1.9155295646523718, "grad_norm": 0.7573511600494385, "learning_rate": 3.0844704353476285e-05, "loss": 0.1545, "step": 14740 }, { "epoch": 1.916829109811566, "grad_norm": 0.2716207802295685, "learning_rate": 3.083170890188434e-05, "loss": 0.1726, "step": 14750 }, { "epoch": 1.9181286549707601, "grad_norm": 0.38272354006767273, "learning_rate": 3.08187134502924e-05, "loss": 0.185, "step": 14760 }, { "epoch": 1.9194282001299545, "grad_norm": 0.7776551246643066, "learning_rate": 3.080571799870045e-05, "loss": 0.1699, "step": 14770 }, { "epoch": 1.920727745289149, "grad_norm": 0.13415709137916565, "learning_rate": 3.0792722547108516e-05, "loss": 0.1572, "step": 14780 }, { "epoch": 1.922027290448343, "grad_norm": 0.5100064277648926, "learning_rate": 3.077972709551657e-05, "loss": 0.1121, "step": 14790 }, { "epoch": 1.9233268356075373, "grad_norm": 0.5043372511863708, "learning_rate": 3.076673164392463e-05, "loss": 0.1467, "step": 14800 }, { "epoch": 1.9246263807667316, "grad_norm": 0.493566632270813, "learning_rate": 3.075373619233268e-05, "loss": 0.1677, "step": 14810 }, { "epoch": 1.925925925925926, "grad_norm": 0.3528434932231903, "learning_rate": 3.074074074074074e-05, "loss": 0.1499, "step": 14820 }, { "epoch": 1.9272254710851202, "grad_norm": 0.30238714814186096, "learning_rate": 3.07277452891488e-05, "loss": 0.1453, "step": 14830 }, { "epoch": 1.9285250162443144, "grad_norm": 0.23013989627361298, "learning_rate": 3.071474983755686e-05, "loss": 0.16, "step": 14840 }, { "epoch": 1.9298245614035088, "grad_norm": 0.14666767418384552, "learning_rate": 3.0701754385964913e-05, "loss": 0.1256, "step": 14850 }, { "epoch": 1.9311241065627032, "grad_norm": 0.3400559723377228, "learning_rate": 3.068875893437297e-05, "loss": 0.2292, "step": 14860 }, { "epoch": 1.9324236517218973, "grad_norm": 0.30508071184158325, "learning_rate": 3.0675763482781025e-05, "loss": 0.2141, "step": 14870 }, { "epoch": 1.9337231968810915, "grad_norm": 0.30597618222236633, "learning_rate": 3.066276803118909e-05, "loss": 0.1431, "step": 14880 }, { "epoch": 1.935022742040286, "grad_norm": 0.5379406213760376, "learning_rate": 3.0649772579597144e-05, "loss": 0.1408, "step": 14890 }, { "epoch": 1.9363222871994803, "grad_norm": 0.2921736240386963, "learning_rate": 3.06367771280052e-05, "loss": 0.1499, "step": 14900 }, { "epoch": 1.9376218323586745, "grad_norm": 0.39923420548439026, "learning_rate": 3.0623781676413255e-05, "loss": 0.1209, "step": 14910 }, { "epoch": 1.9389213775178686, "grad_norm": 0.6372909545898438, "learning_rate": 3.061078622482131e-05, "loss": 0.1652, "step": 14920 }, { "epoch": 1.940220922677063, "grad_norm": 0.25774288177490234, "learning_rate": 3.0597790773229374e-05, "loss": 0.1993, "step": 14930 }, { "epoch": 1.9415204678362574, "grad_norm": 0.43964388966560364, "learning_rate": 3.058479532163743e-05, "loss": 0.1478, "step": 14940 }, { "epoch": 1.9428200129954516, "grad_norm": 0.299450159072876, "learning_rate": 3.0571799870045486e-05, "loss": 0.1657, "step": 14950 }, { "epoch": 1.9441195581546458, "grad_norm": 0.3755657374858856, "learning_rate": 3.055880441845354e-05, "loss": 0.219, "step": 14960 }, { "epoch": 1.9454191033138402, "grad_norm": 0.5183466076850891, "learning_rate": 3.05458089668616e-05, "loss": 0.2062, "step": 14970 }, { "epoch": 1.9467186484730346, "grad_norm": 0.5389581918716431, "learning_rate": 3.053281351526966e-05, "loss": 0.1806, "step": 14980 }, { "epoch": 1.9480181936322287, "grad_norm": 0.39337313175201416, "learning_rate": 3.0519818063677716e-05, "loss": 0.158, "step": 14990 }, { "epoch": 1.949317738791423, "grad_norm": 0.446102499961853, "learning_rate": 3.050682261208577e-05, "loss": 0.2488, "step": 15000 }, { "epoch": 1.9506172839506173, "grad_norm": 0.7877377867698669, "learning_rate": 3.0493827160493827e-05, "loss": 0.1915, "step": 15010 }, { "epoch": 1.9519168291098117, "grad_norm": 0.19605094194412231, "learning_rate": 3.0480831708901887e-05, "loss": 0.1176, "step": 15020 }, { "epoch": 1.9532163742690059, "grad_norm": 0.5737112164497375, "learning_rate": 3.0467836257309946e-05, "loss": 0.1866, "step": 15030 }, { "epoch": 1.9545159194282, "grad_norm": 0.2602459490299225, "learning_rate": 3.0454840805718e-05, "loss": 0.1742, "step": 15040 }, { "epoch": 1.9558154645873944, "grad_norm": 0.29080647230148315, "learning_rate": 3.0441845354126058e-05, "loss": 0.2432, "step": 15050 }, { "epoch": 1.9571150097465888, "grad_norm": 0.3889710307121277, "learning_rate": 3.0428849902534113e-05, "loss": 0.152, "step": 15060 }, { "epoch": 1.958414554905783, "grad_norm": 0.255004346370697, "learning_rate": 3.0415854450942173e-05, "loss": 0.2021, "step": 15070 }, { "epoch": 1.9597141000649771, "grad_norm": 0.22947849333286285, "learning_rate": 3.0402858999350232e-05, "loss": 0.1878, "step": 15080 }, { "epoch": 1.9610136452241715, "grad_norm": 0.1205660030245781, "learning_rate": 3.0389863547758284e-05, "loss": 0.1791, "step": 15090 }, { "epoch": 1.962313190383366, "grad_norm": 0.4840257167816162, "learning_rate": 3.0376868096166344e-05, "loss": 0.149, "step": 15100 }, { "epoch": 1.96361273554256, "grad_norm": 0.16355867683887482, "learning_rate": 3.03638726445744e-05, "loss": 0.1703, "step": 15110 }, { "epoch": 1.9649122807017543, "grad_norm": 0.7035380601882935, "learning_rate": 3.035087719298246e-05, "loss": 0.2053, "step": 15120 }, { "epoch": 1.9662118258609487, "grad_norm": 0.9461174011230469, "learning_rate": 3.0337881741390518e-05, "loss": 0.1243, "step": 15130 }, { "epoch": 1.967511371020143, "grad_norm": 0.3603643476963043, "learning_rate": 3.032488628979857e-05, "loss": 0.1421, "step": 15140 }, { "epoch": 1.9688109161793372, "grad_norm": 0.3574307858943939, "learning_rate": 3.031189083820663e-05, "loss": 0.1724, "step": 15150 }, { "epoch": 1.9701104613385314, "grad_norm": 0.1741548627614975, "learning_rate": 3.0298895386614686e-05, "loss": 0.2023, "step": 15160 }, { "epoch": 1.9714100064977258, "grad_norm": 0.7150457501411438, "learning_rate": 3.0285899935022745e-05, "loss": 0.1741, "step": 15170 }, { "epoch": 1.9727095516569202, "grad_norm": 0.3186459243297577, "learning_rate": 3.0272904483430804e-05, "loss": 0.1388, "step": 15180 }, { "epoch": 1.9740090968161144, "grad_norm": 0.4231090545654297, "learning_rate": 3.0259909031838857e-05, "loss": 0.161, "step": 15190 }, { "epoch": 1.9753086419753085, "grad_norm": 0.3624158501625061, "learning_rate": 3.0246913580246916e-05, "loss": 0.2249, "step": 15200 }, { "epoch": 1.976608187134503, "grad_norm": 0.23652219772338867, "learning_rate": 3.023391812865497e-05, "loss": 0.1775, "step": 15210 }, { "epoch": 1.9779077322936973, "grad_norm": 0.9364799857139587, "learning_rate": 3.022092267706303e-05, "loss": 0.1986, "step": 15220 }, { "epoch": 1.9792072774528915, "grad_norm": 1.0035923719406128, "learning_rate": 3.020792722547109e-05, "loss": 0.2297, "step": 15230 }, { "epoch": 1.9805068226120857, "grad_norm": 0.42337673902511597, "learning_rate": 3.0194931773879143e-05, "loss": 0.1759, "step": 15240 }, { "epoch": 1.98180636777128, "grad_norm": 0.6450037956237793, "learning_rate": 3.0181936322287202e-05, "loss": 0.1334, "step": 15250 }, { "epoch": 1.9831059129304744, "grad_norm": 0.45546087622642517, "learning_rate": 3.0168940870695254e-05, "loss": 0.108, "step": 15260 }, { "epoch": 1.9844054580896686, "grad_norm": 0.37234658002853394, "learning_rate": 3.0155945419103317e-05, "loss": 0.1688, "step": 15270 }, { "epoch": 1.9857050032488628, "grad_norm": 0.37991148233413696, "learning_rate": 3.0142949967511376e-05, "loss": 0.1624, "step": 15280 }, { "epoch": 1.9870045484080572, "grad_norm": 0.4407542645931244, "learning_rate": 3.012995451591943e-05, "loss": 0.1937, "step": 15290 }, { "epoch": 1.9883040935672516, "grad_norm": 0.501749575138092, "learning_rate": 3.0116959064327488e-05, "loss": 0.2048, "step": 15300 }, { "epoch": 1.9896036387264457, "grad_norm": 0.31212615966796875, "learning_rate": 3.010396361273554e-05, "loss": 0.1401, "step": 15310 }, { "epoch": 1.99090318388564, "grad_norm": 0.4380424916744232, "learning_rate": 3.00909681611436e-05, "loss": 0.1791, "step": 15320 }, { "epoch": 1.9922027290448343, "grad_norm": 0.2605857849121094, "learning_rate": 3.0077972709551662e-05, "loss": 0.1329, "step": 15330 }, { "epoch": 1.9935022742040287, "grad_norm": 0.164524644613266, "learning_rate": 3.0064977257959715e-05, "loss": 0.1145, "step": 15340 }, { "epoch": 1.9948018193632229, "grad_norm": 0.3236043453216553, "learning_rate": 3.0051981806367774e-05, "loss": 0.1589, "step": 15350 }, { "epoch": 1.996101364522417, "grad_norm": 0.765828549861908, "learning_rate": 3.0038986354775826e-05, "loss": 0.1897, "step": 15360 }, { "epoch": 1.9974009096816114, "grad_norm": 1.0739402770996094, "learning_rate": 3.0025990903183886e-05, "loss": 0.2015, "step": 15370 }, { "epoch": 1.9987004548408058, "grad_norm": 0.5038200616836548, "learning_rate": 3.0012995451591945e-05, "loss": 0.1642, "step": 15380 }, { "epoch": 2.0, "grad_norm": 1.209784984588623, "learning_rate": 3e-05, "loss": 0.2042, "step": 15390 }, { "epoch": 2.0, "eval_loss": 0.12356271594762802, "eval_runtime": 854.8392, "eval_samples_per_second": 9.002, "eval_steps_per_second": 9.002, "step": 15390 }, { "epoch": 2.001299545159194, "grad_norm": 0.3012406826019287, "learning_rate": 2.998700454840806e-05, "loss": 0.1203, "step": 15400 }, { "epoch": 2.0025990903183883, "grad_norm": 0.45136338472366333, "learning_rate": 2.9974009096816112e-05, "loss": 0.1029, "step": 15410 }, { "epoch": 2.003898635477583, "grad_norm": 0.4245690107345581, "learning_rate": 2.9961013645224172e-05, "loss": 0.1793, "step": 15420 }, { "epoch": 2.005198180636777, "grad_norm": 0.6124729514122009, "learning_rate": 2.994801819363223e-05, "loss": 0.1531, "step": 15430 }, { "epoch": 2.0064977257959713, "grad_norm": 0.573155403137207, "learning_rate": 2.9935022742040287e-05, "loss": 0.1357, "step": 15440 }, { "epoch": 2.007797270955166, "grad_norm": 0.23088154196739197, "learning_rate": 2.9922027290448346e-05, "loss": 0.1393, "step": 15450 }, { "epoch": 2.00909681611436, "grad_norm": 1.1306272745132446, "learning_rate": 2.99090318388564e-05, "loss": 0.1866, "step": 15460 }, { "epoch": 2.0103963612735543, "grad_norm": 0.5896508693695068, "learning_rate": 2.9896036387264458e-05, "loss": 0.1187, "step": 15470 }, { "epoch": 2.0116959064327484, "grad_norm": 0.7323217988014221, "learning_rate": 2.9883040935672517e-05, "loss": 0.1379, "step": 15480 }, { "epoch": 2.0129954515919426, "grad_norm": 0.3027571737766266, "learning_rate": 2.9870045484080573e-05, "loss": 0.1094, "step": 15490 }, { "epoch": 2.014294996751137, "grad_norm": 0.5248409509658813, "learning_rate": 2.9857050032488632e-05, "loss": 0.1581, "step": 15500 }, { "epoch": 2.0155945419103314, "grad_norm": 0.2591512203216553, "learning_rate": 2.9844054580896685e-05, "loss": 0.1234, "step": 15510 }, { "epoch": 2.0168940870695256, "grad_norm": 0.2647464871406555, "learning_rate": 2.9831059129304744e-05, "loss": 0.1011, "step": 15520 }, { "epoch": 2.01819363222872, "grad_norm": 0.46301883459091187, "learning_rate": 2.9818063677712803e-05, "loss": 0.1705, "step": 15530 }, { "epoch": 2.0194931773879143, "grad_norm": 0.34709951281547546, "learning_rate": 2.980506822612086e-05, "loss": 0.1519, "step": 15540 }, { "epoch": 2.0207927225471085, "grad_norm": 0.8779345750808716, "learning_rate": 2.9792072774528918e-05, "loss": 0.1397, "step": 15550 }, { "epoch": 2.0220922677063027, "grad_norm": 0.45668038725852966, "learning_rate": 2.977907732293697e-05, "loss": 0.1315, "step": 15560 }, { "epoch": 2.023391812865497, "grad_norm": 0.842630922794342, "learning_rate": 2.976608187134503e-05, "loss": 0.1144, "step": 15570 }, { "epoch": 2.0246913580246915, "grad_norm": 0.3668392598628998, "learning_rate": 2.975308641975309e-05, "loss": 0.1295, "step": 15580 }, { "epoch": 2.0259909031838856, "grad_norm": 0.4033503830432892, "learning_rate": 2.9740090968161145e-05, "loss": 0.2221, "step": 15590 }, { "epoch": 2.02729044834308, "grad_norm": 0.7159742712974548, "learning_rate": 2.9727095516569204e-05, "loss": 0.1464, "step": 15600 }, { "epoch": 2.0285899935022744, "grad_norm": 0.27073732018470764, "learning_rate": 2.9714100064977257e-05, "loss": 0.126, "step": 15610 }, { "epoch": 2.0298895386614686, "grad_norm": 0.44027355313301086, "learning_rate": 2.9701104613385316e-05, "loss": 0.1102, "step": 15620 }, { "epoch": 2.0311890838206628, "grad_norm": 0.4421376883983612, "learning_rate": 2.9688109161793375e-05, "loss": 0.1177, "step": 15630 }, { "epoch": 2.032488628979857, "grad_norm": 0.3860877454280853, "learning_rate": 2.967511371020143e-05, "loss": 0.0986, "step": 15640 }, { "epoch": 2.033788174139051, "grad_norm": 0.28295642137527466, "learning_rate": 2.966211825860949e-05, "loss": 0.1137, "step": 15650 }, { "epoch": 2.0350877192982457, "grad_norm": 0.3138054311275482, "learning_rate": 2.9649122807017543e-05, "loss": 0.1578, "step": 15660 }, { "epoch": 2.03638726445744, "grad_norm": 0.2723567485809326, "learning_rate": 2.9636127355425602e-05, "loss": 0.1248, "step": 15670 }, { "epoch": 2.037686809616634, "grad_norm": 0.5099914073944092, "learning_rate": 2.962313190383366e-05, "loss": 0.1256, "step": 15680 }, { "epoch": 2.0389863547758287, "grad_norm": 0.9252695441246033, "learning_rate": 2.9610136452241717e-05, "loss": 0.1835, "step": 15690 }, { "epoch": 2.040285899935023, "grad_norm": 0.2059582620859146, "learning_rate": 2.9597141000649776e-05, "loss": 0.1675, "step": 15700 }, { "epoch": 2.041585445094217, "grad_norm": 0.8724639415740967, "learning_rate": 2.958414554905783e-05, "loss": 0.1355, "step": 15710 }, { "epoch": 2.042884990253411, "grad_norm": 0.46018508076667786, "learning_rate": 2.9571150097465888e-05, "loss": 0.1154, "step": 15720 }, { "epoch": 2.0441845354126054, "grad_norm": 0.31214484572410583, "learning_rate": 2.9558154645873947e-05, "loss": 0.1301, "step": 15730 }, { "epoch": 2.0454840805718, "grad_norm": 0.8946526646614075, "learning_rate": 2.9545159194282003e-05, "loss": 0.2275, "step": 15740 }, { "epoch": 2.046783625730994, "grad_norm": 0.41382327675819397, "learning_rate": 2.9532163742690062e-05, "loss": 0.1704, "step": 15750 }, { "epoch": 2.0480831708901883, "grad_norm": 1.2137680053710938, "learning_rate": 2.9519168291098115e-05, "loss": 0.154, "step": 15760 }, { "epoch": 2.049382716049383, "grad_norm": 0.46422940492630005, "learning_rate": 2.9506172839506174e-05, "loss": 0.1082, "step": 15770 }, { "epoch": 2.050682261208577, "grad_norm": 0.7331463098526001, "learning_rate": 2.9493177387914233e-05, "loss": 0.1038, "step": 15780 }, { "epoch": 2.0519818063677713, "grad_norm": 0.9033154845237732, "learning_rate": 2.948018193632229e-05, "loss": 0.1388, "step": 15790 }, { "epoch": 2.0532813515269654, "grad_norm": 0.5128363370895386, "learning_rate": 2.946718648473035e-05, "loss": 0.1181, "step": 15800 }, { "epoch": 2.0545808966861596, "grad_norm": 0.3344348669052124, "learning_rate": 2.94541910331384e-05, "loss": 0.1153, "step": 15810 }, { "epoch": 2.0558804418453542, "grad_norm": 0.4547254741191864, "learning_rate": 2.944119558154646e-05, "loss": 0.0965, "step": 15820 }, { "epoch": 2.0571799870045484, "grad_norm": 0.28738540410995483, "learning_rate": 2.942820012995452e-05, "loss": 0.1568, "step": 15830 }, { "epoch": 2.0584795321637426, "grad_norm": 0.4281133711338043, "learning_rate": 2.9415204678362572e-05, "loss": 0.1246, "step": 15840 }, { "epoch": 2.059779077322937, "grad_norm": 0.5161533355712891, "learning_rate": 2.9402209226770635e-05, "loss": 0.1142, "step": 15850 }, { "epoch": 2.0610786224821314, "grad_norm": 0.35561051964759827, "learning_rate": 2.9389213775178687e-05, "loss": 0.1506, "step": 15860 }, { "epoch": 2.0623781676413255, "grad_norm": 0.24043315649032593, "learning_rate": 2.9376218323586746e-05, "loss": 0.1132, "step": 15870 }, { "epoch": 2.0636777128005197, "grad_norm": 0.5305408835411072, "learning_rate": 2.9363222871994805e-05, "loss": 0.1414, "step": 15880 }, { "epoch": 2.064977257959714, "grad_norm": 0.6524245142936707, "learning_rate": 2.9350227420402858e-05, "loss": 0.0959, "step": 15890 }, { "epoch": 2.0662768031189085, "grad_norm": 0.5799593329429626, "learning_rate": 2.9337231968810917e-05, "loss": 0.1749, "step": 15900 }, { "epoch": 2.0675763482781027, "grad_norm": 0.45801419019699097, "learning_rate": 2.9324236517218973e-05, "loss": 0.1566, "step": 15910 }, { "epoch": 2.068875893437297, "grad_norm": 0.4585916996002197, "learning_rate": 2.9311241065627032e-05, "loss": 0.1432, "step": 15920 }, { "epoch": 2.0701754385964914, "grad_norm": 1.2469173669815063, "learning_rate": 2.929824561403509e-05, "loss": 0.1824, "step": 15930 }, { "epoch": 2.0714749837556856, "grad_norm": 0.6270967721939087, "learning_rate": 2.9285250162443144e-05, "loss": 0.0889, "step": 15940 }, { "epoch": 2.07277452891488, "grad_norm": 0.32804441452026367, "learning_rate": 2.9272254710851203e-05, "loss": 0.114, "step": 15950 }, { "epoch": 2.074074074074074, "grad_norm": 0.45163092017173767, "learning_rate": 2.925925925925926e-05, "loss": 0.1226, "step": 15960 }, { "epoch": 2.075373619233268, "grad_norm": 0.594606339931488, "learning_rate": 2.924626380766732e-05, "loss": 0.1851, "step": 15970 }, { "epoch": 2.0766731643924627, "grad_norm": 0.6952602863311768, "learning_rate": 2.9233268356075378e-05, "loss": 0.1358, "step": 15980 }, { "epoch": 2.077972709551657, "grad_norm": 0.3409059941768646, "learning_rate": 2.922027290448343e-05, "loss": 0.1832, "step": 15990 }, { "epoch": 2.079272254710851, "grad_norm": 0.3592456579208374, "learning_rate": 2.920727745289149e-05, "loss": 0.1404, "step": 16000 }, { "epoch": 2.0805717998700457, "grad_norm": 0.3858984112739563, "learning_rate": 2.9194282001299545e-05, "loss": 0.1823, "step": 16010 }, { "epoch": 2.08187134502924, "grad_norm": 0.5559977889060974, "learning_rate": 2.9181286549707604e-05, "loss": 0.1554, "step": 16020 }, { "epoch": 2.083170890188434, "grad_norm": 0.48098304867744446, "learning_rate": 2.9168291098115664e-05, "loss": 0.1711, "step": 16030 }, { "epoch": 2.084470435347628, "grad_norm": 0.22287891805171967, "learning_rate": 2.9155295646523716e-05, "loss": 0.1509, "step": 16040 }, { "epoch": 2.0857699805068224, "grad_norm": 0.3370320796966553, "learning_rate": 2.9142300194931775e-05, "loss": 0.1286, "step": 16050 }, { "epoch": 2.087069525666017, "grad_norm": 0.48927041888237, "learning_rate": 2.912930474333983e-05, "loss": 0.0973, "step": 16060 }, { "epoch": 2.088369070825211, "grad_norm": 0.6718174815177917, "learning_rate": 2.911630929174789e-05, "loss": 0.1367, "step": 16070 }, { "epoch": 2.0896686159844053, "grad_norm": 0.16415958106517792, "learning_rate": 2.910331384015595e-05, "loss": 0.1562, "step": 16080 }, { "epoch": 2.0909681611436, "grad_norm": 0.4877859055995941, "learning_rate": 2.9090318388564002e-05, "loss": 0.1005, "step": 16090 }, { "epoch": 2.092267706302794, "grad_norm": 0.40357327461242676, "learning_rate": 2.907732293697206e-05, "loss": 0.1742, "step": 16100 }, { "epoch": 2.0935672514619883, "grad_norm": 0.2956276834011078, "learning_rate": 2.9064327485380117e-05, "loss": 0.1188, "step": 16110 }, { "epoch": 2.0948667966211825, "grad_norm": 0.35211649537086487, "learning_rate": 2.9051332033788176e-05, "loss": 0.117, "step": 16120 }, { "epoch": 2.0961663417803766, "grad_norm": 0.46299290657043457, "learning_rate": 2.9038336582196236e-05, "loss": 0.1085, "step": 16130 }, { "epoch": 2.0974658869395713, "grad_norm": 0.20995362102985382, "learning_rate": 2.9025341130604288e-05, "loss": 0.0997, "step": 16140 }, { "epoch": 2.0987654320987654, "grad_norm": 0.6572122573852539, "learning_rate": 2.9012345679012347e-05, "loss": 0.385, "step": 16150 }, { "epoch": 2.1000649772579596, "grad_norm": 0.482829749584198, "learning_rate": 2.8999350227420403e-05, "loss": 0.1162, "step": 16160 }, { "epoch": 2.101364522417154, "grad_norm": 0.3945275545120239, "learning_rate": 2.8986354775828463e-05, "loss": 0.169, "step": 16170 }, { "epoch": 2.1026640675763484, "grad_norm": 0.284521222114563, "learning_rate": 2.8973359324236522e-05, "loss": 0.1159, "step": 16180 }, { "epoch": 2.1039636127355426, "grad_norm": 0.3332878649234772, "learning_rate": 2.8960363872644574e-05, "loss": 0.185, "step": 16190 }, { "epoch": 2.1052631578947367, "grad_norm": 0.16479192674160004, "learning_rate": 2.8947368421052634e-05, "loss": 0.175, "step": 16200 }, { "epoch": 2.106562703053931, "grad_norm": 0.39071163535118103, "learning_rate": 2.893437296946069e-05, "loss": 0.1531, "step": 16210 }, { "epoch": 2.1078622482131255, "grad_norm": 0.6269940137863159, "learning_rate": 2.892137751786875e-05, "loss": 0.1218, "step": 16220 }, { "epoch": 2.1091617933723197, "grad_norm": 0.6498335003852844, "learning_rate": 2.8908382066276808e-05, "loss": 0.1564, "step": 16230 }, { "epoch": 2.110461338531514, "grad_norm": 0.375203400850296, "learning_rate": 2.889538661468486e-05, "loss": 0.1584, "step": 16240 }, { "epoch": 2.1117608836907085, "grad_norm": 0.45093435049057007, "learning_rate": 2.888239116309292e-05, "loss": 0.2411, "step": 16250 }, { "epoch": 2.1130604288499026, "grad_norm": 0.5191018581390381, "learning_rate": 2.8869395711500975e-05, "loss": 0.119, "step": 16260 }, { "epoch": 2.114359974009097, "grad_norm": 0.22441083192825317, "learning_rate": 2.8856400259909035e-05, "loss": 0.1021, "step": 16270 }, { "epoch": 2.115659519168291, "grad_norm": 0.627520740032196, "learning_rate": 2.8843404808317094e-05, "loss": 0.1368, "step": 16280 }, { "epoch": 2.116959064327485, "grad_norm": 0.6207196712493896, "learning_rate": 2.8830409356725146e-05, "loss": 0.1456, "step": 16290 }, { "epoch": 2.1182586094866798, "grad_norm": 0.34196093678474426, "learning_rate": 2.8817413905133206e-05, "loss": 0.1624, "step": 16300 }, { "epoch": 2.119558154645874, "grad_norm": 0.5166024565696716, "learning_rate": 2.880441845354126e-05, "loss": 0.1061, "step": 16310 }, { "epoch": 2.120857699805068, "grad_norm": 0.7249741554260254, "learning_rate": 2.879142300194932e-05, "loss": 0.1389, "step": 16320 }, { "epoch": 2.1221572449642627, "grad_norm": 0.31980645656585693, "learning_rate": 2.877842755035738e-05, "loss": 0.1396, "step": 16330 }, { "epoch": 2.123456790123457, "grad_norm": 0.3897608518600464, "learning_rate": 2.8765432098765432e-05, "loss": 0.1412, "step": 16340 }, { "epoch": 2.124756335282651, "grad_norm": 0.25489869713783264, "learning_rate": 2.875243664717349e-05, "loss": 0.1457, "step": 16350 }, { "epoch": 2.1260558804418452, "grad_norm": 0.37390950322151184, "learning_rate": 2.8739441195581544e-05, "loss": 0.1184, "step": 16360 }, { "epoch": 2.1273554256010394, "grad_norm": 0.36094629764556885, "learning_rate": 2.8726445743989607e-05, "loss": 0.1075, "step": 16370 }, { "epoch": 2.128654970760234, "grad_norm": 0.5759547352790833, "learning_rate": 2.8713450292397666e-05, "loss": 0.1182, "step": 16380 }, { "epoch": 2.129954515919428, "grad_norm": 0.5038614273071289, "learning_rate": 2.870045484080572e-05, "loss": 0.1074, "step": 16390 }, { "epoch": 2.1312540610786224, "grad_norm": 0.3105831444263458, "learning_rate": 2.8687459389213778e-05, "loss": 0.1619, "step": 16400 }, { "epoch": 2.132553606237817, "grad_norm": 0.4356473982334137, "learning_rate": 2.867446393762183e-05, "loss": 0.1479, "step": 16410 }, { "epoch": 2.133853151397011, "grad_norm": 0.5382697582244873, "learning_rate": 2.866146848602989e-05, "loss": 0.1481, "step": 16420 }, { "epoch": 2.1351526965562053, "grad_norm": 0.28396251797676086, "learning_rate": 2.8648473034437952e-05, "loss": 0.1623, "step": 16430 }, { "epoch": 2.1364522417153995, "grad_norm": 0.6900269985198975, "learning_rate": 2.8635477582846005e-05, "loss": 0.2111, "step": 16440 }, { "epoch": 2.1377517868745937, "grad_norm": 0.6330050826072693, "learning_rate": 2.8622482131254064e-05, "loss": 0.1959, "step": 16450 }, { "epoch": 2.1390513320337883, "grad_norm": 0.29641857743263245, "learning_rate": 2.8609486679662116e-05, "loss": 0.1391, "step": 16460 }, { "epoch": 2.1403508771929824, "grad_norm": 0.5468245148658752, "learning_rate": 2.8596491228070175e-05, "loss": 0.1538, "step": 16470 }, { "epoch": 2.1416504223521766, "grad_norm": 0.7489967942237854, "learning_rate": 2.8583495776478235e-05, "loss": 0.2441, "step": 16480 }, { "epoch": 2.1429499675113712, "grad_norm": 0.22202546894550323, "learning_rate": 2.857050032488629e-05, "loss": 0.1804, "step": 16490 }, { "epoch": 2.1442495126705654, "grad_norm": 0.9586085677146912, "learning_rate": 2.855750487329435e-05, "loss": 0.1383, "step": 16500 }, { "epoch": 2.1455490578297596, "grad_norm": 0.9647799730300903, "learning_rate": 2.8544509421702402e-05, "loss": 0.1631, "step": 16510 }, { "epoch": 2.1468486029889537, "grad_norm": 0.20107677578926086, "learning_rate": 2.853151397011046e-05, "loss": 0.0916, "step": 16520 }, { "epoch": 2.148148148148148, "grad_norm": 0.4246368706226349, "learning_rate": 2.851851851851852e-05, "loss": 0.1416, "step": 16530 }, { "epoch": 2.1494476933073425, "grad_norm": 0.4454677700996399, "learning_rate": 2.8505523066926577e-05, "loss": 0.1645, "step": 16540 }, { "epoch": 2.1507472384665367, "grad_norm": 0.5902255177497864, "learning_rate": 2.8492527615334636e-05, "loss": 0.1292, "step": 16550 }, { "epoch": 2.152046783625731, "grad_norm": 0.17026260495185852, "learning_rate": 2.847953216374269e-05, "loss": 0.1462, "step": 16560 }, { "epoch": 2.1533463287849255, "grad_norm": 1.1530053615570068, "learning_rate": 2.8466536712150748e-05, "loss": 0.1674, "step": 16570 }, { "epoch": 2.1546458739441197, "grad_norm": 0.294809490442276, "learning_rate": 2.8453541260558807e-05, "loss": 0.135, "step": 16580 }, { "epoch": 2.155945419103314, "grad_norm": 0.6989735960960388, "learning_rate": 2.8440545808966863e-05, "loss": 0.1593, "step": 16590 }, { "epoch": 2.157244964262508, "grad_norm": 0.5469947457313538, "learning_rate": 2.8427550357374922e-05, "loss": 0.0979, "step": 16600 }, { "epoch": 2.158544509421702, "grad_norm": 0.635773241519928, "learning_rate": 2.8414554905782974e-05, "loss": 0.1299, "step": 16610 }, { "epoch": 2.159844054580897, "grad_norm": 0.6304984092712402, "learning_rate": 2.8401559454191034e-05, "loss": 0.1969, "step": 16620 }, { "epoch": 2.161143599740091, "grad_norm": 0.7570547461509705, "learning_rate": 2.8388564002599093e-05, "loss": 0.1822, "step": 16630 }, { "epoch": 2.162443144899285, "grad_norm": 0.9788710474967957, "learning_rate": 2.837556855100715e-05, "loss": 0.1389, "step": 16640 }, { "epoch": 2.1637426900584797, "grad_norm": 0.4087946116924286, "learning_rate": 2.8362573099415208e-05, "loss": 0.1495, "step": 16650 }, { "epoch": 2.165042235217674, "grad_norm": 0.3145006000995636, "learning_rate": 2.834957764782326e-05, "loss": 0.128, "step": 16660 }, { "epoch": 2.166341780376868, "grad_norm": 0.924323558807373, "learning_rate": 2.833658219623132e-05, "loss": 0.1408, "step": 16670 }, { "epoch": 2.1676413255360623, "grad_norm": 0.5125880837440491, "learning_rate": 2.832358674463938e-05, "loss": 0.1197, "step": 16680 }, { "epoch": 2.1689408706952564, "grad_norm": 0.3911534249782562, "learning_rate": 2.8310591293047435e-05, "loss": 0.1227, "step": 16690 }, { "epoch": 2.170240415854451, "grad_norm": 0.2046302855014801, "learning_rate": 2.8297595841455494e-05, "loss": 0.1645, "step": 16700 }, { "epoch": 2.171539961013645, "grad_norm": 0.44029560685157776, "learning_rate": 2.8284600389863547e-05, "loss": 0.1476, "step": 16710 }, { "epoch": 2.1728395061728394, "grad_norm": 0.3483363389968872, "learning_rate": 2.8271604938271606e-05, "loss": 0.1189, "step": 16720 }, { "epoch": 2.174139051332034, "grad_norm": 1.3779213428497314, "learning_rate": 2.8258609486679665e-05, "loss": 0.1453, "step": 16730 }, { "epoch": 2.175438596491228, "grad_norm": 0.5638744235038757, "learning_rate": 2.824561403508772e-05, "loss": 0.144, "step": 16740 }, { "epoch": 2.1767381416504223, "grad_norm": 0.5420145988464355, "learning_rate": 2.823261858349578e-05, "loss": 0.1399, "step": 16750 }, { "epoch": 2.1780376868096165, "grad_norm": 0.8909449577331543, "learning_rate": 2.8219623131903833e-05, "loss": 0.1738, "step": 16760 }, { "epoch": 2.1793372319688107, "grad_norm": 0.4650874733924866, "learning_rate": 2.8206627680311892e-05, "loss": 0.1191, "step": 16770 }, { "epoch": 2.1806367771280053, "grad_norm": 0.6424671411514282, "learning_rate": 2.819363222871995e-05, "loss": 0.2599, "step": 16780 }, { "epoch": 2.1819363222871995, "grad_norm": 0.8589619994163513, "learning_rate": 2.8180636777128007e-05, "loss": 0.2044, "step": 16790 }, { "epoch": 2.1832358674463936, "grad_norm": 0.20612066984176636, "learning_rate": 2.8167641325536066e-05, "loss": 0.0898, "step": 16800 }, { "epoch": 2.1845354126055883, "grad_norm": 0.44053080677986145, "learning_rate": 2.815464587394412e-05, "loss": 0.1097, "step": 16810 }, { "epoch": 2.1858349577647824, "grad_norm": 0.5138782262802124, "learning_rate": 2.8141650422352178e-05, "loss": 0.1229, "step": 16820 }, { "epoch": 2.1871345029239766, "grad_norm": 0.858270525932312, "learning_rate": 2.8128654970760237e-05, "loss": 0.1545, "step": 16830 }, { "epoch": 2.1884340480831708, "grad_norm": 0.41714441776275635, "learning_rate": 2.8115659519168293e-05, "loss": 0.1271, "step": 16840 }, { "epoch": 2.189733593242365, "grad_norm": 0.3828919529914856, "learning_rate": 2.8102664067576352e-05, "loss": 0.139, "step": 16850 }, { "epoch": 2.1910331384015596, "grad_norm": 0.13904297351837158, "learning_rate": 2.8089668615984405e-05, "loss": 0.1388, "step": 16860 }, { "epoch": 2.1923326835607537, "grad_norm": 0.2652682065963745, "learning_rate": 2.8076673164392464e-05, "loss": 0.15, "step": 16870 }, { "epoch": 2.193632228719948, "grad_norm": 0.4026249051094055, "learning_rate": 2.8063677712800523e-05, "loss": 0.1164, "step": 16880 }, { "epoch": 2.1949317738791425, "grad_norm": 0.7563934922218323, "learning_rate": 2.805068226120858e-05, "loss": 0.1624, "step": 16890 }, { "epoch": 2.1962313190383367, "grad_norm": 0.9158092141151428, "learning_rate": 2.8037686809616638e-05, "loss": 0.1613, "step": 16900 }, { "epoch": 2.197530864197531, "grad_norm": 0.9388392567634583, "learning_rate": 2.802469135802469e-05, "loss": 0.2057, "step": 16910 }, { "epoch": 2.198830409356725, "grad_norm": 0.22922798991203308, "learning_rate": 2.801169590643275e-05, "loss": 0.1124, "step": 16920 }, { "epoch": 2.200129954515919, "grad_norm": 0.48792973160743713, "learning_rate": 2.799870045484081e-05, "loss": 0.1356, "step": 16930 }, { "epoch": 2.201429499675114, "grad_norm": 1.192221760749817, "learning_rate": 2.7985705003248862e-05, "loss": 0.1305, "step": 16940 }, { "epoch": 2.202729044834308, "grad_norm": 1.0980403423309326, "learning_rate": 2.7972709551656924e-05, "loss": 0.1977, "step": 16950 }, { "epoch": 2.204028589993502, "grad_norm": 0.5934943556785583, "learning_rate": 2.7959714100064977e-05, "loss": 0.1495, "step": 16960 }, { "epoch": 2.2053281351526968, "grad_norm": 0.20551812648773193, "learning_rate": 2.7946718648473036e-05, "loss": 0.1333, "step": 16970 }, { "epoch": 2.206627680311891, "grad_norm": 0.18006043136119843, "learning_rate": 2.7933723196881095e-05, "loss": 0.1214, "step": 16980 }, { "epoch": 2.207927225471085, "grad_norm": 0.5070958137512207, "learning_rate": 2.7920727745289148e-05, "loss": 0.166, "step": 16990 }, { "epoch": 2.2092267706302793, "grad_norm": 0.33393484354019165, "learning_rate": 2.7907732293697207e-05, "loss": 0.1467, "step": 17000 }, { "epoch": 2.2105263157894735, "grad_norm": 0.2343466877937317, "learning_rate": 2.7894736842105263e-05, "loss": 0.1736, "step": 17010 }, { "epoch": 2.211825860948668, "grad_norm": 0.6706521511077881, "learning_rate": 2.7881741390513322e-05, "loss": 0.1687, "step": 17020 }, { "epoch": 2.2131254061078622, "grad_norm": 1.7708979845046997, "learning_rate": 2.786874593892138e-05, "loss": 0.1869, "step": 17030 }, { "epoch": 2.2144249512670564, "grad_norm": 0.7349026799201965, "learning_rate": 2.7855750487329434e-05, "loss": 0.2158, "step": 17040 }, { "epoch": 2.215724496426251, "grad_norm": 0.6819010972976685, "learning_rate": 2.7842755035737493e-05, "loss": 0.2342, "step": 17050 }, { "epoch": 2.217024041585445, "grad_norm": 0.16676035523414612, "learning_rate": 2.782975958414555e-05, "loss": 0.1134, "step": 17060 }, { "epoch": 2.2183235867446394, "grad_norm": 0.2694058418273926, "learning_rate": 2.7816764132553608e-05, "loss": 0.118, "step": 17070 }, { "epoch": 2.2196231319038335, "grad_norm": 0.526313841342926, "learning_rate": 2.7803768680961667e-05, "loss": 0.088, "step": 17080 }, { "epoch": 2.2209226770630277, "grad_norm": 0.8220738172531128, "learning_rate": 2.779077322936972e-05, "loss": 0.0998, "step": 17090 }, { "epoch": 2.2222222222222223, "grad_norm": 1.1011462211608887, "learning_rate": 2.777777777777778e-05, "loss": 0.1766, "step": 17100 }, { "epoch": 2.2235217673814165, "grad_norm": 0.45535317063331604, "learning_rate": 2.7764782326185835e-05, "loss": 0.1333, "step": 17110 }, { "epoch": 2.2248213125406107, "grad_norm": 0.644749641418457, "learning_rate": 2.7751786874593894e-05, "loss": 0.1232, "step": 17120 }, { "epoch": 2.2261208576998053, "grad_norm": 0.6584835648536682, "learning_rate": 2.7738791423001953e-05, "loss": 0.1311, "step": 17130 }, { "epoch": 2.2274204028589994, "grad_norm": 0.264278769493103, "learning_rate": 2.7725795971410006e-05, "loss": 0.1106, "step": 17140 }, { "epoch": 2.2287199480181936, "grad_norm": 0.6769689321517944, "learning_rate": 2.7712800519818065e-05, "loss": 0.1715, "step": 17150 }, { "epoch": 2.230019493177388, "grad_norm": 0.2856903076171875, "learning_rate": 2.769980506822612e-05, "loss": 0.1603, "step": 17160 }, { "epoch": 2.231319038336582, "grad_norm": 0.5065121650695801, "learning_rate": 2.768680961663418e-05, "loss": 0.1641, "step": 17170 }, { "epoch": 2.2326185834957766, "grad_norm": 0.8379623889923096, "learning_rate": 2.767381416504224e-05, "loss": 0.0942, "step": 17180 }, { "epoch": 2.2339181286549707, "grad_norm": 0.39154180884361267, "learning_rate": 2.7660818713450292e-05, "loss": 0.1767, "step": 17190 }, { "epoch": 2.235217673814165, "grad_norm": 0.17083945870399475, "learning_rate": 2.764782326185835e-05, "loss": 0.1342, "step": 17200 }, { "epoch": 2.2365172189733595, "grad_norm": 0.3927488923072815, "learning_rate": 2.7634827810266407e-05, "loss": 0.1605, "step": 17210 }, { "epoch": 2.2378167641325537, "grad_norm": 0.4504440724849701, "learning_rate": 2.7621832358674466e-05, "loss": 0.233, "step": 17220 }, { "epoch": 2.239116309291748, "grad_norm": 1.0428270101547241, "learning_rate": 2.7608836907082526e-05, "loss": 0.158, "step": 17230 }, { "epoch": 2.240415854450942, "grad_norm": 0.5633832812309265, "learning_rate": 2.7595841455490578e-05, "loss": 0.1019, "step": 17240 }, { "epoch": 2.241715399610136, "grad_norm": 0.2267511934041977, "learning_rate": 2.7582846003898637e-05, "loss": 0.1835, "step": 17250 }, { "epoch": 2.243014944769331, "grad_norm": 0.49485158920288086, "learning_rate": 2.7569850552306693e-05, "loss": 0.1063, "step": 17260 }, { "epoch": 2.244314489928525, "grad_norm": 0.6672847867012024, "learning_rate": 2.7556855100714752e-05, "loss": 0.1497, "step": 17270 }, { "epoch": 2.245614035087719, "grad_norm": 0.48239195346832275, "learning_rate": 2.754385964912281e-05, "loss": 0.1488, "step": 17280 }, { "epoch": 2.246913580246914, "grad_norm": 0.29161882400512695, "learning_rate": 2.7530864197530864e-05, "loss": 0.1174, "step": 17290 }, { "epoch": 2.248213125406108, "grad_norm": 0.3051050901412964, "learning_rate": 2.7517868745938923e-05, "loss": 0.1125, "step": 17300 }, { "epoch": 2.249512670565302, "grad_norm": 0.23346132040023804, "learning_rate": 2.750487329434698e-05, "loss": 0.1173, "step": 17310 }, { "epoch": 2.2508122157244963, "grad_norm": 0.3097531199455261, "learning_rate": 2.749187784275504e-05, "loss": 0.1616, "step": 17320 }, { "epoch": 2.2521117608836905, "grad_norm": 0.6086621284484863, "learning_rate": 2.7478882391163098e-05, "loss": 0.1309, "step": 17330 }, { "epoch": 2.253411306042885, "grad_norm": 0.6774113774299622, "learning_rate": 2.746588693957115e-05, "loss": 0.1057, "step": 17340 }, { "epoch": 2.2547108512020793, "grad_norm": 0.34792831540107727, "learning_rate": 2.745289148797921e-05, "loss": 0.1476, "step": 17350 }, { "epoch": 2.2560103963612734, "grad_norm": 1.0342179536819458, "learning_rate": 2.7439896036387265e-05, "loss": 0.158, "step": 17360 }, { "epoch": 2.257309941520468, "grad_norm": 0.9049148559570312, "learning_rate": 2.7426900584795324e-05, "loss": 0.1727, "step": 17370 }, { "epoch": 2.258609486679662, "grad_norm": 0.5906630754470825, "learning_rate": 2.7413905133203384e-05, "loss": 0.1523, "step": 17380 }, { "epoch": 2.2599090318388564, "grad_norm": 0.20486943423748016, "learning_rate": 2.7400909681611436e-05, "loss": 0.1635, "step": 17390 }, { "epoch": 2.2612085769980506, "grad_norm": 1.129967212677002, "learning_rate": 2.7387914230019495e-05, "loss": 0.2258, "step": 17400 }, { "epoch": 2.2625081221572447, "grad_norm": 0.39960843324661255, "learning_rate": 2.7374918778427548e-05, "loss": 0.1194, "step": 17410 }, { "epoch": 2.2638076673164393, "grad_norm": 1.3820998668670654, "learning_rate": 2.736192332683561e-05, "loss": 0.1903, "step": 17420 }, { "epoch": 2.2651072124756335, "grad_norm": 0.2660486400127411, "learning_rate": 2.734892787524367e-05, "loss": 0.1304, "step": 17430 }, { "epoch": 2.2664067576348277, "grad_norm": 0.5102384686470032, "learning_rate": 2.7335932423651722e-05, "loss": 0.1179, "step": 17440 }, { "epoch": 2.2677063027940223, "grad_norm": 0.254859983921051, "learning_rate": 2.732293697205978e-05, "loss": 0.1155, "step": 17450 }, { "epoch": 2.2690058479532165, "grad_norm": 0.6288928389549255, "learning_rate": 2.7309941520467834e-05, "loss": 0.1043, "step": 17460 }, { "epoch": 2.2703053931124106, "grad_norm": 1.8990604877471924, "learning_rate": 2.7296946068875893e-05, "loss": 0.2598, "step": 17470 }, { "epoch": 2.271604938271605, "grad_norm": 0.23880353569984436, "learning_rate": 2.7283950617283956e-05, "loss": 0.1803, "step": 17480 }, { "epoch": 2.272904483430799, "grad_norm": 0.2074812352657318, "learning_rate": 2.727095516569201e-05, "loss": 0.1809, "step": 17490 }, { "epoch": 2.2742040285899936, "grad_norm": 0.4894918203353882, "learning_rate": 2.7257959714100068e-05, "loss": 0.158, "step": 17500 }, { "epoch": 2.2755035737491878, "grad_norm": 0.30930081009864807, "learning_rate": 2.724496426250812e-05, "loss": 0.1568, "step": 17510 }, { "epoch": 2.276803118908382, "grad_norm": 0.7343089580535889, "learning_rate": 2.723196881091618e-05, "loss": 0.2337, "step": 17520 }, { "epoch": 2.2781026640675766, "grad_norm": 0.8892118334770203, "learning_rate": 2.721897335932424e-05, "loss": 0.1398, "step": 17530 }, { "epoch": 2.2794022092267707, "grad_norm": 0.727379560470581, "learning_rate": 2.7205977907732294e-05, "loss": 0.1434, "step": 17540 }, { "epoch": 2.280701754385965, "grad_norm": 0.6517507433891296, "learning_rate": 2.7192982456140354e-05, "loss": 0.1282, "step": 17550 }, { "epoch": 2.282001299545159, "grad_norm": 0.3229377269744873, "learning_rate": 2.7179987004548406e-05, "loss": 0.1924, "step": 17560 }, { "epoch": 2.2833008447043532, "grad_norm": 0.36913856863975525, "learning_rate": 2.7166991552956465e-05, "loss": 0.1173, "step": 17570 }, { "epoch": 2.284600389863548, "grad_norm": 0.693495512008667, "learning_rate": 2.7153996101364525e-05, "loss": 0.1174, "step": 17580 }, { "epoch": 2.285899935022742, "grad_norm": 1.0938019752502441, "learning_rate": 2.714100064977258e-05, "loss": 0.1325, "step": 17590 }, { "epoch": 2.287199480181936, "grad_norm": 0.7431447505950928, "learning_rate": 2.712800519818064e-05, "loss": 0.1459, "step": 17600 }, { "epoch": 2.288499025341131, "grad_norm": 0.6116625666618347, "learning_rate": 2.7115009746588692e-05, "loss": 0.1336, "step": 17610 }, { "epoch": 2.289798570500325, "grad_norm": 0.4304735064506531, "learning_rate": 2.710201429499675e-05, "loss": 0.1073, "step": 17620 }, { "epoch": 2.291098115659519, "grad_norm": 0.801864504814148, "learning_rate": 2.708901884340481e-05, "loss": 0.1331, "step": 17630 }, { "epoch": 2.2923976608187133, "grad_norm": 0.32164880633354187, "learning_rate": 2.7076023391812866e-05, "loss": 0.1401, "step": 17640 }, { "epoch": 2.2936972059779075, "grad_norm": 0.31390681862831116, "learning_rate": 2.7063027940220926e-05, "loss": 0.1648, "step": 17650 }, { "epoch": 2.294996751137102, "grad_norm": 0.2994040250778198, "learning_rate": 2.7050032488628978e-05, "loss": 0.1208, "step": 17660 }, { "epoch": 2.2962962962962963, "grad_norm": 0.43916577100753784, "learning_rate": 2.7037037037037037e-05, "loss": 0.1121, "step": 17670 }, { "epoch": 2.2975958414554905, "grad_norm": 0.263508677482605, "learning_rate": 2.7024041585445097e-05, "loss": 0.1837, "step": 17680 }, { "epoch": 2.298895386614685, "grad_norm": 0.17791792750358582, "learning_rate": 2.7011046133853153e-05, "loss": 0.147, "step": 17690 }, { "epoch": 2.3001949317738792, "grad_norm": 0.9349750876426697, "learning_rate": 2.6998050682261212e-05, "loss": 0.1965, "step": 17700 }, { "epoch": 2.3014944769330734, "grad_norm": 0.36618295311927795, "learning_rate": 2.6985055230669264e-05, "loss": 0.1128, "step": 17710 }, { "epoch": 2.3027940220922676, "grad_norm": 0.4422851502895355, "learning_rate": 2.6972059779077323e-05, "loss": 0.1175, "step": 17720 }, { "epoch": 2.3040935672514617, "grad_norm": 0.43622520565986633, "learning_rate": 2.6959064327485383e-05, "loss": 0.1346, "step": 17730 }, { "epoch": 2.3053931124106564, "grad_norm": 0.5269821882247925, "learning_rate": 2.694606887589344e-05, "loss": 0.1081, "step": 17740 }, { "epoch": 2.3066926575698505, "grad_norm": 1.135745644569397, "learning_rate": 2.6933073424301498e-05, "loss": 0.1545, "step": 17750 }, { "epoch": 2.3079922027290447, "grad_norm": 0.7358306050300598, "learning_rate": 2.692007797270955e-05, "loss": 0.1067, "step": 17760 }, { "epoch": 2.3092917478882393, "grad_norm": 1.014992594718933, "learning_rate": 2.690708252111761e-05, "loss": 0.1399, "step": 17770 }, { "epoch": 2.3105912930474335, "grad_norm": 0.7363002896308899, "learning_rate": 2.689408706952567e-05, "loss": 0.1556, "step": 17780 }, { "epoch": 2.3118908382066277, "grad_norm": 0.730646550655365, "learning_rate": 2.6881091617933725e-05, "loss": 0.1706, "step": 17790 }, { "epoch": 2.313190383365822, "grad_norm": 0.5053973197937012, "learning_rate": 2.6868096166341784e-05, "loss": 0.1252, "step": 17800 }, { "epoch": 2.314489928525016, "grad_norm": 0.21022523939609528, "learning_rate": 2.6855100714749836e-05, "loss": 0.1252, "step": 17810 }, { "epoch": 2.3157894736842106, "grad_norm": 0.4393979609012604, "learning_rate": 2.6842105263157896e-05, "loss": 0.1634, "step": 17820 }, { "epoch": 2.317089018843405, "grad_norm": 0.7046792507171631, "learning_rate": 2.6829109811565955e-05, "loss": 0.1876, "step": 17830 }, { "epoch": 2.318388564002599, "grad_norm": 0.4438394606113434, "learning_rate": 2.681611435997401e-05, "loss": 0.1842, "step": 17840 }, { "epoch": 2.3196881091617936, "grad_norm": 0.1426054984331131, "learning_rate": 2.680311890838207e-05, "loss": 0.0775, "step": 17850 }, { "epoch": 2.3209876543209877, "grad_norm": 0.3332922160625458, "learning_rate": 2.6790123456790122e-05, "loss": 0.1201, "step": 17860 }, { "epoch": 2.322287199480182, "grad_norm": 0.7243021726608276, "learning_rate": 2.677712800519818e-05, "loss": 0.1645, "step": 17870 }, { "epoch": 2.323586744639376, "grad_norm": 0.2822357714176178, "learning_rate": 2.676413255360624e-05, "loss": 0.1663, "step": 17880 }, { "epoch": 2.3248862897985703, "grad_norm": 0.3108772039413452, "learning_rate": 2.6751137102014297e-05, "loss": 0.118, "step": 17890 }, { "epoch": 2.326185834957765, "grad_norm": 0.4276488721370697, "learning_rate": 2.6738141650422356e-05, "loss": 0.1302, "step": 17900 }, { "epoch": 2.327485380116959, "grad_norm": 0.3077802062034607, "learning_rate": 2.672514619883041e-05, "loss": 0.0887, "step": 17910 }, { "epoch": 2.328784925276153, "grad_norm": 0.7009322047233582, "learning_rate": 2.6712150747238468e-05, "loss": 0.1682, "step": 17920 }, { "epoch": 2.330084470435348, "grad_norm": 0.7829368114471436, "learning_rate": 2.6699155295646527e-05, "loss": 0.1428, "step": 17930 }, { "epoch": 2.331384015594542, "grad_norm": 0.3944217562675476, "learning_rate": 2.6686159844054583e-05, "loss": 0.1112, "step": 17940 }, { "epoch": 2.332683560753736, "grad_norm": 0.42813318967819214, "learning_rate": 2.6673164392462642e-05, "loss": 0.1336, "step": 17950 }, { "epoch": 2.3339831059129303, "grad_norm": 0.8001407980918884, "learning_rate": 2.6660168940870695e-05, "loss": 0.1429, "step": 17960 }, { "epoch": 2.3352826510721245, "grad_norm": 0.3671790659427643, "learning_rate": 2.6647173489278754e-05, "loss": 0.1121, "step": 17970 }, { "epoch": 2.336582196231319, "grad_norm": 0.49975842237472534, "learning_rate": 2.6634178037686813e-05, "loss": 0.1702, "step": 17980 }, { "epoch": 2.3378817413905133, "grad_norm": 0.32665425539016724, "learning_rate": 2.6621182586094865e-05, "loss": 0.1051, "step": 17990 }, { "epoch": 2.3391812865497075, "grad_norm": 0.9178919196128845, "learning_rate": 2.6608187134502928e-05, "loss": 0.1413, "step": 18000 }, { "epoch": 2.340480831708902, "grad_norm": 0.5944603681564331, "learning_rate": 2.659519168291098e-05, "loss": 0.1162, "step": 18010 }, { "epoch": 2.3417803768680963, "grad_norm": 0.4394367039203644, "learning_rate": 2.658219623131904e-05, "loss": 0.1724, "step": 18020 }, { "epoch": 2.3430799220272904, "grad_norm": 0.8241539001464844, "learning_rate": 2.65692007797271e-05, "loss": 0.1896, "step": 18030 }, { "epoch": 2.3443794671864846, "grad_norm": 0.6052781939506531, "learning_rate": 2.655620532813515e-05, "loss": 0.1588, "step": 18040 }, { "epoch": 2.3456790123456788, "grad_norm": 0.1499132215976715, "learning_rate": 2.654320987654321e-05, "loss": 0.1361, "step": 18050 }, { "epoch": 2.3469785575048734, "grad_norm": 0.5708144307136536, "learning_rate": 2.6530214424951267e-05, "loss": 0.1133, "step": 18060 }, { "epoch": 2.3482781026640676, "grad_norm": 0.6428653001785278, "learning_rate": 2.6517218973359326e-05, "loss": 0.1162, "step": 18070 }, { "epoch": 2.3495776478232617, "grad_norm": 0.1617823988199234, "learning_rate": 2.6504223521767385e-05, "loss": 0.1608, "step": 18080 }, { "epoch": 2.3508771929824563, "grad_norm": 0.25945839285850525, "learning_rate": 2.6491228070175438e-05, "loss": 0.0779, "step": 18090 }, { "epoch": 2.3521767381416505, "grad_norm": 0.40474942326545715, "learning_rate": 2.6478232618583497e-05, "loss": 0.1257, "step": 18100 }, { "epoch": 2.3534762833008447, "grad_norm": 0.6648324131965637, "learning_rate": 2.6465237166991553e-05, "loss": 0.1561, "step": 18110 }, { "epoch": 2.354775828460039, "grad_norm": 0.10207844525575638, "learning_rate": 2.6452241715399612e-05, "loss": 0.1069, "step": 18120 }, { "epoch": 2.356075373619233, "grad_norm": 0.644662618637085, "learning_rate": 2.643924626380767e-05, "loss": 0.1725, "step": 18130 }, { "epoch": 2.3573749187784276, "grad_norm": 0.17888614535331726, "learning_rate": 2.6426250812215724e-05, "loss": 0.1496, "step": 18140 }, { "epoch": 2.358674463937622, "grad_norm": 0.9150230884552002, "learning_rate": 2.6413255360623783e-05, "loss": 0.1181, "step": 18150 }, { "epoch": 2.359974009096816, "grad_norm": 0.17107780277729034, "learning_rate": 2.640025990903184e-05, "loss": 0.1732, "step": 18160 }, { "epoch": 2.3612735542560106, "grad_norm": 0.2882084846496582, "learning_rate": 2.6387264457439898e-05, "loss": 0.109, "step": 18170 }, { "epoch": 2.3625730994152048, "grad_norm": 0.39812368154525757, "learning_rate": 2.6374269005847957e-05, "loss": 0.1069, "step": 18180 }, { "epoch": 2.363872644574399, "grad_norm": 0.984254777431488, "learning_rate": 2.636127355425601e-05, "loss": 0.1768, "step": 18190 }, { "epoch": 2.365172189733593, "grad_norm": 0.41295260190963745, "learning_rate": 2.634827810266407e-05, "loss": 0.1309, "step": 18200 }, { "epoch": 2.3664717348927873, "grad_norm": 1.618194818496704, "learning_rate": 2.6335282651072125e-05, "loss": 0.161, "step": 18210 }, { "epoch": 2.367771280051982, "grad_norm": 0.4676576256752014, "learning_rate": 2.6322287199480184e-05, "loss": 0.1731, "step": 18220 }, { "epoch": 2.369070825211176, "grad_norm": 0.3791716992855072, "learning_rate": 2.6309291747888243e-05, "loss": 0.1757, "step": 18230 }, { "epoch": 2.3703703703703702, "grad_norm": 0.48546674847602844, "learning_rate": 2.6296296296296296e-05, "loss": 0.1589, "step": 18240 }, { "epoch": 2.371669915529565, "grad_norm": 1.0922486782073975, "learning_rate": 2.6283300844704355e-05, "loss": 0.1538, "step": 18250 }, { "epoch": 2.372969460688759, "grad_norm": 1.5687587261199951, "learning_rate": 2.627030539311241e-05, "loss": 0.1516, "step": 18260 }, { "epoch": 2.374269005847953, "grad_norm": 0.4238356351852417, "learning_rate": 2.625730994152047e-05, "loss": 0.2054, "step": 18270 }, { "epoch": 2.3755685510071474, "grad_norm": 0.4506850242614746, "learning_rate": 2.624431448992853e-05, "loss": 0.1365, "step": 18280 }, { "epoch": 2.3768680961663415, "grad_norm": 0.34603580832481384, "learning_rate": 2.6231319038336582e-05, "loss": 0.1048, "step": 18290 }, { "epoch": 2.378167641325536, "grad_norm": 0.4600529968738556, "learning_rate": 2.621832358674464e-05, "loss": 0.1052, "step": 18300 }, { "epoch": 2.3794671864847303, "grad_norm": 0.7442044019699097, "learning_rate": 2.6205328135152697e-05, "loss": 0.1709, "step": 18310 }, { "epoch": 2.3807667316439245, "grad_norm": 0.3970494568347931, "learning_rate": 2.6192332683560756e-05, "loss": 0.1278, "step": 18320 }, { "epoch": 2.382066276803119, "grad_norm": 0.44776445627212524, "learning_rate": 2.6179337231968815e-05, "loss": 0.173, "step": 18330 }, { "epoch": 2.3833658219623133, "grad_norm": 0.41407209634780884, "learning_rate": 2.6166341780376868e-05, "loss": 0.1208, "step": 18340 }, { "epoch": 2.3846653671215075, "grad_norm": 0.31252920627593994, "learning_rate": 2.6153346328784927e-05, "loss": 0.1341, "step": 18350 }, { "epoch": 2.3859649122807016, "grad_norm": 0.2936232388019562, "learning_rate": 2.6140350877192983e-05, "loss": 0.128, "step": 18360 }, { "epoch": 2.387264457439896, "grad_norm": 0.5615895390510559, "learning_rate": 2.6127355425601042e-05, "loss": 0.1824, "step": 18370 }, { "epoch": 2.3885640025990904, "grad_norm": 0.25393590331077576, "learning_rate": 2.61143599740091e-05, "loss": 0.1243, "step": 18380 }, { "epoch": 2.3898635477582846, "grad_norm": 0.2626432478427887, "learning_rate": 2.6101364522417154e-05, "loss": 0.1498, "step": 18390 }, { "epoch": 2.3911630929174787, "grad_norm": 0.6261804103851318, "learning_rate": 2.6088369070825213e-05, "loss": 0.1849, "step": 18400 }, { "epoch": 2.3924626380766734, "grad_norm": 0.3015376031398773, "learning_rate": 2.607537361923327e-05, "loss": 0.1132, "step": 18410 }, { "epoch": 2.3937621832358675, "grad_norm": 0.7158381342887878, "learning_rate": 2.6062378167641328e-05, "loss": 0.2099, "step": 18420 }, { "epoch": 2.3950617283950617, "grad_norm": 0.5463991165161133, "learning_rate": 2.6049382716049388e-05, "loss": 0.0991, "step": 18430 }, { "epoch": 2.396361273554256, "grad_norm": 0.7470540404319763, "learning_rate": 2.603638726445744e-05, "loss": 0.1733, "step": 18440 }, { "epoch": 2.39766081871345, "grad_norm": 0.4632083773612976, "learning_rate": 2.60233918128655e-05, "loss": 0.1737, "step": 18450 }, { "epoch": 2.3989603638726447, "grad_norm": 0.4652135968208313, "learning_rate": 2.6010396361273555e-05, "loss": 0.114, "step": 18460 }, { "epoch": 2.400259909031839, "grad_norm": 0.864936888217926, "learning_rate": 2.5997400909681614e-05, "loss": 0.1281, "step": 18470 }, { "epoch": 2.401559454191033, "grad_norm": 0.5250647664070129, "learning_rate": 2.5984405458089674e-05, "loss": 0.1387, "step": 18480 }, { "epoch": 2.4028589993502276, "grad_norm": 0.5105955004692078, "learning_rate": 2.5971410006497726e-05, "loss": 0.1691, "step": 18490 }, { "epoch": 2.404158544509422, "grad_norm": 1.129549264907837, "learning_rate": 2.5958414554905785e-05, "loss": 0.1458, "step": 18500 }, { "epoch": 2.405458089668616, "grad_norm": 0.6289495825767517, "learning_rate": 2.5945419103313838e-05, "loss": 0.1211, "step": 18510 }, { "epoch": 2.40675763482781, "grad_norm": 0.3526167869567871, "learning_rate": 2.59324236517219e-05, "loss": 0.1372, "step": 18520 }, { "epoch": 2.4080571799870043, "grad_norm": 0.48680028319358826, "learning_rate": 2.591942820012996e-05, "loss": 0.1205, "step": 18530 }, { "epoch": 2.409356725146199, "grad_norm": 0.606815755367279, "learning_rate": 2.5906432748538012e-05, "loss": 0.2346, "step": 18540 }, { "epoch": 2.410656270305393, "grad_norm": 0.4979795217514038, "learning_rate": 2.589343729694607e-05, "loss": 0.1314, "step": 18550 }, { "epoch": 2.4119558154645873, "grad_norm": 0.34787625074386597, "learning_rate": 2.5880441845354124e-05, "loss": 0.1135, "step": 18560 }, { "epoch": 2.413255360623782, "grad_norm": 0.47099539637565613, "learning_rate": 2.5867446393762183e-05, "loss": 0.1008, "step": 18570 }, { "epoch": 2.414554905782976, "grad_norm": 0.5399274230003357, "learning_rate": 2.5854450942170246e-05, "loss": 0.1799, "step": 18580 }, { "epoch": 2.41585445094217, "grad_norm": 0.3384268581867218, "learning_rate": 2.5841455490578298e-05, "loss": 0.1159, "step": 18590 }, { "epoch": 2.4171539961013644, "grad_norm": 0.3352048993110657, "learning_rate": 2.5828460038986357e-05, "loss": 0.1365, "step": 18600 }, { "epoch": 2.4184535412605586, "grad_norm": 0.48148053884506226, "learning_rate": 2.581546458739441e-05, "loss": 0.1374, "step": 18610 }, { "epoch": 2.419753086419753, "grad_norm": 0.7791986465454102, "learning_rate": 2.580246913580247e-05, "loss": 0.1025, "step": 18620 }, { "epoch": 2.4210526315789473, "grad_norm": 0.14681562781333923, "learning_rate": 2.578947368421053e-05, "loss": 0.1155, "step": 18630 }, { "epoch": 2.4223521767381415, "grad_norm": 0.1768292933702469, "learning_rate": 2.5776478232618584e-05, "loss": 0.1066, "step": 18640 }, { "epoch": 2.423651721897336, "grad_norm": 0.7861000299453735, "learning_rate": 2.5763482781026643e-05, "loss": 0.1425, "step": 18650 }, { "epoch": 2.4249512670565303, "grad_norm": 0.36134323477745056, "learning_rate": 2.5750487329434696e-05, "loss": 0.2129, "step": 18660 }, { "epoch": 2.4262508122157245, "grad_norm": 0.30541256070137024, "learning_rate": 2.5737491877842755e-05, "loss": 0.1142, "step": 18670 }, { "epoch": 2.4275503573749186, "grad_norm": 0.546960711479187, "learning_rate": 2.5724496426250814e-05, "loss": 0.0883, "step": 18680 }, { "epoch": 2.428849902534113, "grad_norm": 0.6943312287330627, "learning_rate": 2.571150097465887e-05, "loss": 0.1477, "step": 18690 }, { "epoch": 2.4301494476933074, "grad_norm": 0.7150726318359375, "learning_rate": 2.569850552306693e-05, "loss": 0.1648, "step": 18700 }, { "epoch": 2.4314489928525016, "grad_norm": 0.38841167092323303, "learning_rate": 2.5685510071474982e-05, "loss": 0.105, "step": 18710 }, { "epoch": 2.4327485380116958, "grad_norm": 0.11213887482881546, "learning_rate": 2.567251461988304e-05, "loss": 0.1371, "step": 18720 }, { "epoch": 2.4340480831708904, "grad_norm": 3.5140392780303955, "learning_rate": 2.56595191682911e-05, "loss": 0.1282, "step": 18730 }, { "epoch": 2.4353476283300846, "grad_norm": 0.5800433158874512, "learning_rate": 2.5646523716699156e-05, "loss": 0.1439, "step": 18740 }, { "epoch": 2.4366471734892787, "grad_norm": 0.7097199559211731, "learning_rate": 2.5633528265107216e-05, "loss": 0.1214, "step": 18750 }, { "epoch": 2.437946718648473, "grad_norm": 1.1131335496902466, "learning_rate": 2.5620532813515268e-05, "loss": 0.2648, "step": 18760 }, { "epoch": 2.439246263807667, "grad_norm": 0.4857313930988312, "learning_rate": 2.5607537361923327e-05, "loss": 0.1417, "step": 18770 }, { "epoch": 2.4405458089668617, "grad_norm": 0.3993171155452728, "learning_rate": 2.5594541910331387e-05, "loss": 0.1249, "step": 18780 }, { "epoch": 2.441845354126056, "grad_norm": 0.15487544238567352, "learning_rate": 2.5581546458739442e-05, "loss": 0.1954, "step": 18790 }, { "epoch": 2.44314489928525, "grad_norm": 0.31791141629219055, "learning_rate": 2.55685510071475e-05, "loss": 0.1335, "step": 18800 }, { "epoch": 2.4444444444444446, "grad_norm": 0.2591530680656433, "learning_rate": 2.5555555555555554e-05, "loss": 0.1096, "step": 18810 }, { "epoch": 2.445743989603639, "grad_norm": 0.18949678540229797, "learning_rate": 2.5542560103963613e-05, "loss": 0.1368, "step": 18820 }, { "epoch": 2.447043534762833, "grad_norm": 0.22234410047531128, "learning_rate": 2.5529564652371673e-05, "loss": 0.1643, "step": 18830 }, { "epoch": 2.448343079922027, "grad_norm": 0.9218502044677734, "learning_rate": 2.551656920077973e-05, "loss": 0.1779, "step": 18840 }, { "epoch": 2.4496426250812213, "grad_norm": 0.2415187656879425, "learning_rate": 2.5503573749187788e-05, "loss": 0.1683, "step": 18850 }, { "epoch": 2.450942170240416, "grad_norm": 0.3158169090747833, "learning_rate": 2.549057829759584e-05, "loss": 0.2038, "step": 18860 }, { "epoch": 2.45224171539961, "grad_norm": 0.12446051836013794, "learning_rate": 2.54775828460039e-05, "loss": 0.099, "step": 18870 }, { "epoch": 2.4535412605588043, "grad_norm": 0.36569100618362427, "learning_rate": 2.546458739441196e-05, "loss": 0.1452, "step": 18880 }, { "epoch": 2.454840805717999, "grad_norm": 0.5500723719596863, "learning_rate": 2.5451591942820014e-05, "loss": 0.1748, "step": 18890 }, { "epoch": 2.456140350877193, "grad_norm": 0.1513514369726181, "learning_rate": 2.5438596491228074e-05, "loss": 0.2867, "step": 18900 }, { "epoch": 2.4574398960363872, "grad_norm": 0.46075040102005005, "learning_rate": 2.5425601039636126e-05, "loss": 0.1113, "step": 18910 }, { "epoch": 2.4587394411955814, "grad_norm": 0.6624956130981445, "learning_rate": 2.5412605588044185e-05, "loss": 0.1851, "step": 18920 }, { "epoch": 2.4600389863547756, "grad_norm": 0.5960378050804138, "learning_rate": 2.5399610136452245e-05, "loss": 0.1553, "step": 18930 }, { "epoch": 2.46133853151397, "grad_norm": 0.2263716608285904, "learning_rate": 2.53866146848603e-05, "loss": 0.1448, "step": 18940 }, { "epoch": 2.4626380766731644, "grad_norm": 0.24058207869529724, "learning_rate": 2.537361923326836e-05, "loss": 0.1876, "step": 18950 }, { "epoch": 2.4639376218323585, "grad_norm": 0.31973797082901, "learning_rate": 2.5360623781676412e-05, "loss": 0.2004, "step": 18960 }, { "epoch": 2.465237166991553, "grad_norm": 0.7586785554885864, "learning_rate": 2.534762833008447e-05, "loss": 0.1588, "step": 18970 }, { "epoch": 2.4665367121507473, "grad_norm": 0.5545956492424011, "learning_rate": 2.533463287849253e-05, "loss": 0.1582, "step": 18980 }, { "epoch": 2.4678362573099415, "grad_norm": 0.8825656175613403, "learning_rate": 2.5321637426900587e-05, "loss": 0.1448, "step": 18990 }, { "epoch": 2.4691358024691357, "grad_norm": 0.234712615609169, "learning_rate": 2.5308641975308646e-05, "loss": 0.1524, "step": 19000 }, { "epoch": 2.47043534762833, "grad_norm": 0.9787898063659668, "learning_rate": 2.5295646523716698e-05, "loss": 0.1689, "step": 19010 }, { "epoch": 2.4717348927875245, "grad_norm": 0.8227027058601379, "learning_rate": 2.5282651072124758e-05, "loss": 0.2015, "step": 19020 }, { "epoch": 2.4730344379467186, "grad_norm": 0.8828518390655518, "learning_rate": 2.5269655620532817e-05, "loss": 0.1715, "step": 19030 }, { "epoch": 2.474333983105913, "grad_norm": 1.0078274011611938, "learning_rate": 2.5256660168940873e-05, "loss": 0.148, "step": 19040 }, { "epoch": 2.4756335282651074, "grad_norm": 0.43748193979263306, "learning_rate": 2.5243664717348932e-05, "loss": 0.1995, "step": 19050 }, { "epoch": 2.4769330734243016, "grad_norm": 0.43961814045906067, "learning_rate": 2.5230669265756984e-05, "loss": 0.1833, "step": 19060 }, { "epoch": 2.4782326185834957, "grad_norm": 0.441164493560791, "learning_rate": 2.5217673814165044e-05, "loss": 0.1167, "step": 19070 }, { "epoch": 2.47953216374269, "grad_norm": 1.4048677682876587, "learning_rate": 2.5204678362573103e-05, "loss": 0.1245, "step": 19080 }, { "epoch": 2.480831708901884, "grad_norm": 0.5088582634925842, "learning_rate": 2.5191682910981155e-05, "loss": 0.1956, "step": 19090 }, { "epoch": 2.4821312540610787, "grad_norm": 0.5660173296928406, "learning_rate": 2.5178687459389218e-05, "loss": 0.1546, "step": 19100 }, { "epoch": 2.483430799220273, "grad_norm": 0.6997310519218445, "learning_rate": 2.516569200779727e-05, "loss": 0.0983, "step": 19110 }, { "epoch": 2.484730344379467, "grad_norm": 0.4885684549808502, "learning_rate": 2.515269655620533e-05, "loss": 0.1242, "step": 19120 }, { "epoch": 2.4860298895386617, "grad_norm": 0.5997287631034851, "learning_rate": 2.513970110461339e-05, "loss": 0.1547, "step": 19130 }, { "epoch": 2.487329434697856, "grad_norm": 1.1764702796936035, "learning_rate": 2.512670565302144e-05, "loss": 0.2002, "step": 19140 }, { "epoch": 2.48862897985705, "grad_norm": 0.6058632135391235, "learning_rate": 2.51137102014295e-05, "loss": 0.1625, "step": 19150 }, { "epoch": 2.489928525016244, "grad_norm": 0.4218636751174927, "learning_rate": 2.5100714749837556e-05, "loss": 0.1666, "step": 19160 }, { "epoch": 2.4912280701754383, "grad_norm": 0.486470103263855, "learning_rate": 2.5087719298245616e-05, "loss": 0.1906, "step": 19170 }, { "epoch": 2.492527615334633, "grad_norm": 0.2208511382341385, "learning_rate": 2.5074723846653675e-05, "loss": 0.137, "step": 19180 }, { "epoch": 2.493827160493827, "grad_norm": 0.3279654383659363, "learning_rate": 2.5061728395061727e-05, "loss": 0.1414, "step": 19190 }, { "epoch": 2.4951267056530213, "grad_norm": 0.5431009531021118, "learning_rate": 2.5048732943469787e-05, "loss": 0.113, "step": 19200 }, { "epoch": 2.496426250812216, "grad_norm": 0.9703484773635864, "learning_rate": 2.5035737491877843e-05, "loss": 0.1947, "step": 19210 }, { "epoch": 2.49772579597141, "grad_norm": 0.12505382299423218, "learning_rate": 2.5022742040285902e-05, "loss": 0.2074, "step": 19220 }, { "epoch": 2.4990253411306043, "grad_norm": 0.28302034735679626, "learning_rate": 2.500974658869396e-05, "loss": 0.1981, "step": 19230 }, { "epoch": 2.5003248862897984, "grad_norm": 0.36536115407943726, "learning_rate": 2.4996751137102013e-05, "loss": 0.2389, "step": 19240 }, { "epoch": 2.5016244314489926, "grad_norm": 0.2638300955295563, "learning_rate": 2.4983755685510073e-05, "loss": 0.1057, "step": 19250 }, { "epoch": 2.502923976608187, "grad_norm": 0.43846794962882996, "learning_rate": 2.4970760233918132e-05, "loss": 0.1998, "step": 19260 }, { "epoch": 2.5042235217673814, "grad_norm": 0.5244620442390442, "learning_rate": 2.4957764782326188e-05, "loss": 0.1549, "step": 19270 }, { "epoch": 2.5055230669265756, "grad_norm": 0.4409860670566559, "learning_rate": 2.4944769330734244e-05, "loss": 0.1218, "step": 19280 }, { "epoch": 2.50682261208577, "grad_norm": 0.4597545862197876, "learning_rate": 2.49317738791423e-05, "loss": 0.1098, "step": 19290 }, { "epoch": 2.5081221572449643, "grad_norm": 0.3947272002696991, "learning_rate": 2.491877842755036e-05, "loss": 0.132, "step": 19300 }, { "epoch": 2.5094217024041585, "grad_norm": 0.6326673626899719, "learning_rate": 2.4905782975958418e-05, "loss": 0.1227, "step": 19310 }, { "epoch": 2.5107212475633527, "grad_norm": 0.6282929182052612, "learning_rate": 2.4892787524366474e-05, "loss": 0.1509, "step": 19320 }, { "epoch": 2.512020792722547, "grad_norm": 0.23980650305747986, "learning_rate": 2.487979207277453e-05, "loss": 0.1262, "step": 19330 }, { "epoch": 2.5133203378817415, "grad_norm": 0.327137291431427, "learning_rate": 2.4866796621182586e-05, "loss": 0.1329, "step": 19340 }, { "epoch": 2.5146198830409356, "grad_norm": 0.41675111651420593, "learning_rate": 2.485380116959064e-05, "loss": 0.1873, "step": 19350 }, { "epoch": 2.51591942820013, "grad_norm": 0.8738132119178772, "learning_rate": 2.4840805717998704e-05, "loss": 0.131, "step": 19360 }, { "epoch": 2.5172189733593244, "grad_norm": 0.5346332788467407, "learning_rate": 2.482781026640676e-05, "loss": 0.1308, "step": 19370 }, { "epoch": 2.5185185185185186, "grad_norm": 0.43487274646759033, "learning_rate": 2.4814814814814816e-05, "loss": 0.1686, "step": 19380 }, { "epoch": 2.5198180636777128, "grad_norm": 1.271761417388916, "learning_rate": 2.480181936322287e-05, "loss": 0.1483, "step": 19390 }, { "epoch": 2.521117608836907, "grad_norm": 0.8204700350761414, "learning_rate": 2.4788823911630927e-05, "loss": 0.1527, "step": 19400 }, { "epoch": 2.522417153996101, "grad_norm": 0.5451626181602478, "learning_rate": 2.4775828460038987e-05, "loss": 0.1251, "step": 19410 }, { "epoch": 2.5237166991552957, "grad_norm": 0.6228656768798828, "learning_rate": 2.4762833008447046e-05, "loss": 0.1535, "step": 19420 }, { "epoch": 2.52501624431449, "grad_norm": 0.8089338541030884, "learning_rate": 2.4749837556855102e-05, "loss": 0.1192, "step": 19430 }, { "epoch": 2.526315789473684, "grad_norm": 0.16217423975467682, "learning_rate": 2.4736842105263158e-05, "loss": 0.1166, "step": 19440 }, { "epoch": 2.5276153346328787, "grad_norm": 0.2706741690635681, "learning_rate": 2.4723846653671214e-05, "loss": 0.1169, "step": 19450 }, { "epoch": 2.528914879792073, "grad_norm": 0.5131664872169495, "learning_rate": 2.4710851202079273e-05, "loss": 0.1327, "step": 19460 }, { "epoch": 2.530214424951267, "grad_norm": 0.5002783536911011, "learning_rate": 2.4697855750487332e-05, "loss": 0.1154, "step": 19470 }, { "epoch": 2.531513970110461, "grad_norm": 1.0342577695846558, "learning_rate": 2.4684860298895388e-05, "loss": 0.165, "step": 19480 }, { "epoch": 2.5328135152696554, "grad_norm": 0.21879065036773682, "learning_rate": 2.4671864847303444e-05, "loss": 0.0866, "step": 19490 }, { "epoch": 2.53411306042885, "grad_norm": 0.5572959184646606, "learning_rate": 2.46588693957115e-05, "loss": 0.1218, "step": 19500 }, { "epoch": 2.535412605588044, "grad_norm": 0.35132187604904175, "learning_rate": 2.464587394411956e-05, "loss": 0.1527, "step": 19510 }, { "epoch": 2.5367121507472383, "grad_norm": 0.2599393427371979, "learning_rate": 2.4632878492527618e-05, "loss": 0.1503, "step": 19520 }, { "epoch": 2.538011695906433, "grad_norm": 0.19935539364814758, "learning_rate": 2.4619883040935674e-05, "loss": 0.1642, "step": 19530 }, { "epoch": 2.539311241065627, "grad_norm": 1.3532756567001343, "learning_rate": 2.460688758934373e-05, "loss": 0.1537, "step": 19540 }, { "epoch": 2.5406107862248213, "grad_norm": 0.11133318394422531, "learning_rate": 2.4593892137751786e-05, "loss": 0.2171, "step": 19550 }, { "epoch": 2.5419103313840155, "grad_norm": 0.8491989970207214, "learning_rate": 2.4580896686159845e-05, "loss": 0.1784, "step": 19560 }, { "epoch": 2.5432098765432096, "grad_norm": 1.0162081718444824, "learning_rate": 2.4567901234567904e-05, "loss": 0.1512, "step": 19570 }, { "epoch": 2.5445094217024042, "grad_norm": 0.27939170598983765, "learning_rate": 2.455490578297596e-05, "loss": 0.1467, "step": 19580 }, { "epoch": 2.5458089668615984, "grad_norm": 0.38138824701309204, "learning_rate": 2.4541910331384016e-05, "loss": 0.1128, "step": 19590 }, { "epoch": 2.5471085120207926, "grad_norm": 0.5179502964019775, "learning_rate": 2.4528914879792072e-05, "loss": 0.1782, "step": 19600 }, { "epoch": 2.548408057179987, "grad_norm": 0.2533377707004547, "learning_rate": 2.451591942820013e-05, "loss": 0.1112, "step": 19610 }, { "epoch": 2.5497076023391814, "grad_norm": 0.2635590136051178, "learning_rate": 2.450292397660819e-05, "loss": 0.1306, "step": 19620 }, { "epoch": 2.5510071474983755, "grad_norm": 0.3950013816356659, "learning_rate": 2.4489928525016246e-05, "loss": 0.1695, "step": 19630 }, { "epoch": 2.5523066926575697, "grad_norm": 1.1153885126113892, "learning_rate": 2.4476933073424302e-05, "loss": 0.1143, "step": 19640 }, { "epoch": 2.553606237816764, "grad_norm": 0.4041256308555603, "learning_rate": 2.4463937621832358e-05, "loss": 0.1347, "step": 19650 }, { "epoch": 2.5549057829759585, "grad_norm": 0.7505330443382263, "learning_rate": 2.4450942170240417e-05, "loss": 0.1087, "step": 19660 }, { "epoch": 2.5562053281351527, "grad_norm": 0.5803844928741455, "learning_rate": 2.4437946718648473e-05, "loss": 0.1291, "step": 19670 }, { "epoch": 2.557504873294347, "grad_norm": 0.2581333518028259, "learning_rate": 2.4424951267056532e-05, "loss": 0.1479, "step": 19680 }, { "epoch": 2.5588044184535415, "grad_norm": 0.5638561844825745, "learning_rate": 2.4411955815464588e-05, "loss": 0.105, "step": 19690 }, { "epoch": 2.5601039636127356, "grad_norm": 0.18738944828510284, "learning_rate": 2.4398960363872644e-05, "loss": 0.1027, "step": 19700 }, { "epoch": 2.56140350877193, "grad_norm": 0.3440243899822235, "learning_rate": 2.4385964912280703e-05, "loss": 0.1953, "step": 19710 }, { "epoch": 2.562703053931124, "grad_norm": 0.297562837600708, "learning_rate": 2.437296946068876e-05, "loss": 0.0917, "step": 19720 }, { "epoch": 2.564002599090318, "grad_norm": 0.4930593967437744, "learning_rate": 2.4359974009096818e-05, "loss": 0.1615, "step": 19730 }, { "epoch": 2.5653021442495128, "grad_norm": 1.0725723505020142, "learning_rate": 2.4346978557504874e-05, "loss": 0.1855, "step": 19740 }, { "epoch": 2.566601689408707, "grad_norm": 0.5474525690078735, "learning_rate": 2.433398310591293e-05, "loss": 0.1634, "step": 19750 }, { "epoch": 2.567901234567901, "grad_norm": 0.88482266664505, "learning_rate": 2.432098765432099e-05, "loss": 0.2005, "step": 19760 }, { "epoch": 2.5692007797270957, "grad_norm": 0.1466059386730194, "learning_rate": 2.4307992202729045e-05, "loss": 0.1728, "step": 19770 }, { "epoch": 2.57050032488629, "grad_norm": 0.22036239504814148, "learning_rate": 2.4294996751137104e-05, "loss": 0.13, "step": 19780 }, { "epoch": 2.571799870045484, "grad_norm": 1.6013559103012085, "learning_rate": 2.428200129954516e-05, "loss": 0.1909, "step": 19790 }, { "epoch": 2.573099415204678, "grad_norm": 0.36932769417762756, "learning_rate": 2.4269005847953216e-05, "loss": 0.1223, "step": 19800 }, { "epoch": 2.5743989603638724, "grad_norm": 0.23589780926704407, "learning_rate": 2.4256010396361275e-05, "loss": 0.1199, "step": 19810 }, { "epoch": 2.575698505523067, "grad_norm": 0.8034117221832275, "learning_rate": 2.424301494476933e-05, "loss": 0.1496, "step": 19820 }, { "epoch": 2.576998050682261, "grad_norm": 0.4772292971611023, "learning_rate": 2.423001949317739e-05, "loss": 0.0845, "step": 19830 }, { "epoch": 2.5782975958414553, "grad_norm": 0.6734396815299988, "learning_rate": 2.4217024041585446e-05, "loss": 0.1323, "step": 19840 }, { "epoch": 2.57959714100065, "grad_norm": 0.24076104164123535, "learning_rate": 2.4204028589993502e-05, "loss": 0.1452, "step": 19850 }, { "epoch": 2.580896686159844, "grad_norm": 1.0895379781723022, "learning_rate": 2.419103313840156e-05, "loss": 0.1776, "step": 19860 }, { "epoch": 2.5821962313190383, "grad_norm": 0.5034093856811523, "learning_rate": 2.4178037686809617e-05, "loss": 0.1313, "step": 19870 }, { "epoch": 2.5834957764782325, "grad_norm": 0.5133626461029053, "learning_rate": 2.4165042235217676e-05, "loss": 0.1411, "step": 19880 }, { "epoch": 2.5847953216374266, "grad_norm": 0.6634498238563538, "learning_rate": 2.4152046783625732e-05, "loss": 0.11, "step": 19890 }, { "epoch": 2.5860948667966213, "grad_norm": 0.7515686750411987, "learning_rate": 2.4139051332033788e-05, "loss": 0.1281, "step": 19900 }, { "epoch": 2.5873944119558154, "grad_norm": 0.19388560950756073, "learning_rate": 2.4126055880441847e-05, "loss": 0.1266, "step": 19910 }, { "epoch": 2.5886939571150096, "grad_norm": 1.0047085285186768, "learning_rate": 2.4113060428849903e-05, "loss": 0.1608, "step": 19920 }, { "epoch": 2.589993502274204, "grad_norm": 0.51969313621521, "learning_rate": 2.410006497725796e-05, "loss": 0.2167, "step": 19930 }, { "epoch": 2.5912930474333984, "grad_norm": 0.5190099477767944, "learning_rate": 2.4087069525666018e-05, "loss": 0.1373, "step": 19940 }, { "epoch": 2.5925925925925926, "grad_norm": 0.38000208139419556, "learning_rate": 2.4074074074074074e-05, "loss": 0.1337, "step": 19950 }, { "epoch": 2.5938921377517867, "grad_norm": 0.35794034600257874, "learning_rate": 2.4061078622482133e-05, "loss": 0.1138, "step": 19960 }, { "epoch": 2.595191682910981, "grad_norm": 0.2946215867996216, "learning_rate": 2.404808317089019e-05, "loss": 0.1176, "step": 19970 }, { "epoch": 2.5964912280701755, "grad_norm": 0.2818569242954254, "learning_rate": 2.4035087719298245e-05, "loss": 0.108, "step": 19980 }, { "epoch": 2.5977907732293697, "grad_norm": 0.49106791615486145, "learning_rate": 2.4022092267706304e-05, "loss": 0.1349, "step": 19990 }, { "epoch": 2.599090318388564, "grad_norm": 0.30277034640312195, "learning_rate": 2.400909681611436e-05, "loss": 0.1475, "step": 20000 }, { "epoch": 2.6003898635477585, "grad_norm": 1.096961498260498, "learning_rate": 2.399610136452242e-05, "loss": 0.1477, "step": 20010 }, { "epoch": 2.6016894087069526, "grad_norm": 1.063625693321228, "learning_rate": 2.3983105912930475e-05, "loss": 0.278, "step": 20020 }, { "epoch": 2.602988953866147, "grad_norm": 0.3423335552215576, "learning_rate": 2.397011046133853e-05, "loss": 0.1548, "step": 20030 }, { "epoch": 2.604288499025341, "grad_norm": 0.24778339266777039, "learning_rate": 2.395711500974659e-05, "loss": 0.1834, "step": 20040 }, { "epoch": 2.605588044184535, "grad_norm": 0.1404009461402893, "learning_rate": 2.3944119558154646e-05, "loss": 0.1056, "step": 20050 }, { "epoch": 2.6068875893437298, "grad_norm": 0.23776105046272278, "learning_rate": 2.3931124106562705e-05, "loss": 0.1436, "step": 20060 }, { "epoch": 2.608187134502924, "grad_norm": 0.49798938632011414, "learning_rate": 2.391812865497076e-05, "loss": 0.185, "step": 20070 }, { "epoch": 2.609486679662118, "grad_norm": 0.13453899323940277, "learning_rate": 2.3905133203378817e-05, "loss": 0.1593, "step": 20080 }, { "epoch": 2.6107862248213127, "grad_norm": 0.44448012113571167, "learning_rate": 2.3892137751786876e-05, "loss": 0.168, "step": 20090 }, { "epoch": 2.612085769980507, "grad_norm": 0.48921433091163635, "learning_rate": 2.3879142300194932e-05, "loss": 0.223, "step": 20100 }, { "epoch": 2.613385315139701, "grad_norm": 1.11602783203125, "learning_rate": 2.386614684860299e-05, "loss": 0.1234, "step": 20110 }, { "epoch": 2.6146848602988952, "grad_norm": 0.419695645570755, "learning_rate": 2.3853151397011047e-05, "loss": 0.0951, "step": 20120 }, { "epoch": 2.6159844054580894, "grad_norm": 0.2294868379831314, "learning_rate": 2.3840155945419103e-05, "loss": 0.1851, "step": 20130 }, { "epoch": 2.617283950617284, "grad_norm": 0.5915423631668091, "learning_rate": 2.3827160493827162e-05, "loss": 0.1388, "step": 20140 }, { "epoch": 2.618583495776478, "grad_norm": 0.3141067922115326, "learning_rate": 2.381416504223522e-05, "loss": 0.1144, "step": 20150 }, { "epoch": 2.6198830409356724, "grad_norm": 0.40129292011260986, "learning_rate": 2.3801169590643278e-05, "loss": 0.1021, "step": 20160 }, { "epoch": 2.621182586094867, "grad_norm": 0.3467646837234497, "learning_rate": 2.3788174139051333e-05, "loss": 0.1281, "step": 20170 }, { "epoch": 2.622482131254061, "grad_norm": 0.4489988684654236, "learning_rate": 2.377517868745939e-05, "loss": 0.1639, "step": 20180 }, { "epoch": 2.6237816764132553, "grad_norm": 0.16052848100662231, "learning_rate": 2.3762183235867445e-05, "loss": 0.1054, "step": 20190 }, { "epoch": 2.6250812215724495, "grad_norm": 1.0275284051895142, "learning_rate": 2.3749187784275504e-05, "loss": 0.2165, "step": 20200 }, { "epoch": 2.6263807667316437, "grad_norm": 0.3942038118839264, "learning_rate": 2.3736192332683564e-05, "loss": 0.2004, "step": 20210 }, { "epoch": 2.6276803118908383, "grad_norm": 0.4406172037124634, "learning_rate": 2.372319688109162e-05, "loss": 0.1237, "step": 20220 }, { "epoch": 2.6289798570500325, "grad_norm": 0.4239098131656647, "learning_rate": 2.3710201429499675e-05, "loss": 0.1273, "step": 20230 }, { "epoch": 2.6302794022092266, "grad_norm": 0.6475558280944824, "learning_rate": 2.369720597790773e-05, "loss": 0.1719, "step": 20240 }, { "epoch": 2.6315789473684212, "grad_norm": 0.2381664663553238, "learning_rate": 2.368421052631579e-05, "loss": 0.1229, "step": 20250 }, { "epoch": 2.6328784925276154, "grad_norm": 0.665493369102478, "learning_rate": 2.367121507472385e-05, "loss": 0.1547, "step": 20260 }, { "epoch": 2.6341780376868096, "grad_norm": 0.3422938585281372, "learning_rate": 2.3658219623131906e-05, "loss": 0.1096, "step": 20270 }, { "epoch": 2.6354775828460038, "grad_norm": 0.13189047574996948, "learning_rate": 2.364522417153996e-05, "loss": 0.108, "step": 20280 }, { "epoch": 2.636777128005198, "grad_norm": 0.38557443022727966, "learning_rate": 2.3632228719948017e-05, "loss": 0.1434, "step": 20290 }, { "epoch": 2.6380766731643925, "grad_norm": 0.6479376554489136, "learning_rate": 2.3619233268356076e-05, "loss": 0.1206, "step": 20300 }, { "epoch": 2.6393762183235867, "grad_norm": 1.023474931716919, "learning_rate": 2.3606237816764136e-05, "loss": 0.1452, "step": 20310 }, { "epoch": 2.640675763482781, "grad_norm": 0.7558814287185669, "learning_rate": 2.359324236517219e-05, "loss": 0.1087, "step": 20320 }, { "epoch": 2.6419753086419755, "grad_norm": 0.501065194606781, "learning_rate": 2.3580246913580247e-05, "loss": 0.1362, "step": 20330 }, { "epoch": 2.6432748538011697, "grad_norm": 0.5751064419746399, "learning_rate": 2.3567251461988303e-05, "loss": 0.1121, "step": 20340 }, { "epoch": 2.644574398960364, "grad_norm": 0.9591578245162964, "learning_rate": 2.3554256010396363e-05, "loss": 0.1487, "step": 20350 }, { "epoch": 2.645873944119558, "grad_norm": 0.38619494438171387, "learning_rate": 2.3541260558804422e-05, "loss": 0.2406, "step": 20360 }, { "epoch": 2.647173489278752, "grad_norm": 0.2926020324230194, "learning_rate": 2.3528265107212478e-05, "loss": 0.1791, "step": 20370 }, { "epoch": 2.648473034437947, "grad_norm": 0.23321539163589478, "learning_rate": 2.3515269655620533e-05, "loss": 0.097, "step": 20380 }, { "epoch": 2.649772579597141, "grad_norm": 0.847718358039856, "learning_rate": 2.350227420402859e-05, "loss": 0.1576, "step": 20390 }, { "epoch": 2.651072124756335, "grad_norm": 0.29897409677505493, "learning_rate": 2.348927875243665e-05, "loss": 0.0766, "step": 20400 }, { "epoch": 2.6523716699155298, "grad_norm": 0.3380504548549652, "learning_rate": 2.3476283300844708e-05, "loss": 0.1416, "step": 20410 }, { "epoch": 2.653671215074724, "grad_norm": 0.5399699807167053, "learning_rate": 2.3463287849252764e-05, "loss": 0.1316, "step": 20420 }, { "epoch": 2.654970760233918, "grad_norm": 0.5408064723014832, "learning_rate": 2.345029239766082e-05, "loss": 0.1464, "step": 20430 }, { "epoch": 2.6562703053931123, "grad_norm": 0.2081882655620575, "learning_rate": 2.3437296946068875e-05, "loss": 0.1207, "step": 20440 }, { "epoch": 2.6575698505523064, "grad_norm": 0.42002540826797485, "learning_rate": 2.342430149447693e-05, "loss": 0.2212, "step": 20450 }, { "epoch": 2.658869395711501, "grad_norm": 0.2997036576271057, "learning_rate": 2.3411306042884994e-05, "loss": 0.1382, "step": 20460 }, { "epoch": 2.660168940870695, "grad_norm": 0.4733165204524994, "learning_rate": 2.339831059129305e-05, "loss": 0.1621, "step": 20470 }, { "epoch": 2.6614684860298894, "grad_norm": 0.23766589164733887, "learning_rate": 2.3385315139701106e-05, "loss": 0.1763, "step": 20480 }, { "epoch": 2.662768031189084, "grad_norm": 0.2297731339931488, "learning_rate": 2.337231968810916e-05, "loss": 0.1242, "step": 20490 }, { "epoch": 2.664067576348278, "grad_norm": 0.3401143252849579, "learning_rate": 2.3359324236517217e-05, "loss": 0.133, "step": 20500 }, { "epoch": 2.6653671215074723, "grad_norm": 1.1285641193389893, "learning_rate": 2.3346328784925277e-05, "loss": 0.1746, "step": 20510 }, { "epoch": 2.6666666666666665, "grad_norm": 0.42069870233535767, "learning_rate": 2.3333333333333336e-05, "loss": 0.1837, "step": 20520 }, { "epoch": 2.6679662118258607, "grad_norm": 0.15359477698802948, "learning_rate": 2.332033788174139e-05, "loss": 0.1421, "step": 20530 }, { "epoch": 2.6692657569850553, "grad_norm": 0.38622546195983887, "learning_rate": 2.3307342430149448e-05, "loss": 0.0862, "step": 20540 }, { "epoch": 2.6705653021442495, "grad_norm": 0.6889355182647705, "learning_rate": 2.3294346978557503e-05, "loss": 0.1978, "step": 20550 }, { "epoch": 2.6718648473034436, "grad_norm": 0.5597285032272339, "learning_rate": 2.3281351526965563e-05, "loss": 0.1152, "step": 20560 }, { "epoch": 2.6731643924626383, "grad_norm": 0.2597610652446747, "learning_rate": 2.3268356075373622e-05, "loss": 0.139, "step": 20570 }, { "epoch": 2.6744639376218324, "grad_norm": 0.4383527636528015, "learning_rate": 2.3255360623781678e-05, "loss": 0.223, "step": 20580 }, { "epoch": 2.6757634827810266, "grad_norm": 0.43566668033599854, "learning_rate": 2.3242365172189734e-05, "loss": 0.1523, "step": 20590 }, { "epoch": 2.6770630279402208, "grad_norm": 0.2790084183216095, "learning_rate": 2.322936972059779e-05, "loss": 0.1093, "step": 20600 }, { "epoch": 2.678362573099415, "grad_norm": 0.8063797950744629, "learning_rate": 2.321637426900585e-05, "loss": 0.1462, "step": 20610 }, { "epoch": 2.6796621182586096, "grad_norm": 0.5538272857666016, "learning_rate": 2.3203378817413908e-05, "loss": 0.1304, "step": 20620 }, { "epoch": 2.6809616634178037, "grad_norm": 0.9138416051864624, "learning_rate": 2.3190383365821964e-05, "loss": 0.1373, "step": 20630 }, { "epoch": 2.682261208576998, "grad_norm": 0.558573305606842, "learning_rate": 2.317738791423002e-05, "loss": 0.15, "step": 20640 }, { "epoch": 2.6835607537361925, "grad_norm": 0.9636944532394409, "learning_rate": 2.3164392462638075e-05, "loss": 0.2298, "step": 20650 }, { "epoch": 2.6848602988953867, "grad_norm": 0.8129075169563293, "learning_rate": 2.3151397011046135e-05, "loss": 0.1949, "step": 20660 }, { "epoch": 2.686159844054581, "grad_norm": 0.5687807202339172, "learning_rate": 2.3138401559454194e-05, "loss": 0.1327, "step": 20670 }, { "epoch": 2.687459389213775, "grad_norm": 0.8687981367111206, "learning_rate": 2.312540610786225e-05, "loss": 0.1176, "step": 20680 }, { "epoch": 2.688758934372969, "grad_norm": 0.604373574256897, "learning_rate": 2.3112410656270306e-05, "loss": 0.1104, "step": 20690 }, { "epoch": 2.690058479532164, "grad_norm": 0.494907945394516, "learning_rate": 2.309941520467836e-05, "loss": 0.1379, "step": 20700 }, { "epoch": 2.691358024691358, "grad_norm": 0.24186457693576813, "learning_rate": 2.308641975308642e-05, "loss": 0.1582, "step": 20710 }, { "epoch": 2.692657569850552, "grad_norm": 0.4130970239639282, "learning_rate": 2.307342430149448e-05, "loss": 0.1174, "step": 20720 }, { "epoch": 2.6939571150097468, "grad_norm": 0.47432252764701843, "learning_rate": 2.3060428849902536e-05, "loss": 0.1984, "step": 20730 }, { "epoch": 2.695256660168941, "grad_norm": 0.5331589579582214, "learning_rate": 2.3047433398310592e-05, "loss": 0.1045, "step": 20740 }, { "epoch": 2.696556205328135, "grad_norm": 0.4372335970401764, "learning_rate": 2.3034437946718648e-05, "loss": 0.1231, "step": 20750 }, { "epoch": 2.6978557504873293, "grad_norm": 0.4940175414085388, "learning_rate": 2.3021442495126707e-05, "loss": 0.1449, "step": 20760 }, { "epoch": 2.6991552956465235, "grad_norm": 0.5974446535110474, "learning_rate": 2.3008447043534763e-05, "loss": 0.1608, "step": 20770 }, { "epoch": 2.700454840805718, "grad_norm": 0.4847414493560791, "learning_rate": 2.2995451591942822e-05, "loss": 0.12, "step": 20780 }, { "epoch": 2.7017543859649122, "grad_norm": 0.5523954033851624, "learning_rate": 2.2982456140350878e-05, "loss": 0.1642, "step": 20790 }, { "epoch": 2.7030539311241064, "grad_norm": 0.2120533436536789, "learning_rate": 2.2969460688758934e-05, "loss": 0.1144, "step": 20800 }, { "epoch": 2.704353476283301, "grad_norm": 0.3962641954421997, "learning_rate": 2.2956465237166993e-05, "loss": 0.1219, "step": 20810 }, { "epoch": 2.705653021442495, "grad_norm": 0.32131779193878174, "learning_rate": 2.294346978557505e-05, "loss": 0.1634, "step": 20820 }, { "epoch": 2.7069525666016894, "grad_norm": 0.560467541217804, "learning_rate": 2.2930474333983108e-05, "loss": 0.1581, "step": 20830 }, { "epoch": 2.7082521117608835, "grad_norm": 0.39057445526123047, "learning_rate": 2.2917478882391164e-05, "loss": 0.1246, "step": 20840 }, { "epoch": 2.7095516569200777, "grad_norm": 0.32567745447158813, "learning_rate": 2.290448343079922e-05, "loss": 0.1005, "step": 20850 }, { "epoch": 2.7108512020792723, "grad_norm": 0.3529159426689148, "learning_rate": 2.289148797920728e-05, "loss": 0.1367, "step": 20860 }, { "epoch": 2.7121507472384665, "grad_norm": 0.40307146310806274, "learning_rate": 2.2878492527615335e-05, "loss": 0.1733, "step": 20870 }, { "epoch": 2.7134502923976607, "grad_norm": 0.15789256989955902, "learning_rate": 2.2865497076023394e-05, "loss": 0.1805, "step": 20880 }, { "epoch": 2.7147498375568553, "grad_norm": 0.36084654927253723, "learning_rate": 2.285250162443145e-05, "loss": 0.1385, "step": 20890 }, { "epoch": 2.7160493827160495, "grad_norm": 0.24002443253993988, "learning_rate": 2.2839506172839506e-05, "loss": 0.1251, "step": 20900 }, { "epoch": 2.7173489278752436, "grad_norm": 0.7847034931182861, "learning_rate": 2.2826510721247565e-05, "loss": 0.1365, "step": 20910 }, { "epoch": 2.718648473034438, "grad_norm": 0.6085268259048462, "learning_rate": 2.281351526965562e-05, "loss": 0.1228, "step": 20920 }, { "epoch": 2.719948018193632, "grad_norm": 0.5284715890884399, "learning_rate": 2.280051981806368e-05, "loss": 0.0991, "step": 20930 }, { "epoch": 2.7212475633528266, "grad_norm": 0.42259740829467773, "learning_rate": 2.2787524366471736e-05, "loss": 0.1109, "step": 20940 }, { "epoch": 2.7225471085120208, "grad_norm": 0.3131564259529114, "learning_rate": 2.2774528914879792e-05, "loss": 0.173, "step": 20950 }, { "epoch": 2.723846653671215, "grad_norm": 0.4454595446586609, "learning_rate": 2.276153346328785e-05, "loss": 0.1402, "step": 20960 }, { "epoch": 2.7251461988304095, "grad_norm": 0.5619806051254272, "learning_rate": 2.2748538011695907e-05, "loss": 0.1717, "step": 20970 }, { "epoch": 2.7264457439896037, "grad_norm": 0.42630109190940857, "learning_rate": 2.2735542560103966e-05, "loss": 0.1641, "step": 20980 }, { "epoch": 2.727745289148798, "grad_norm": 0.35065758228302, "learning_rate": 2.2722547108512022e-05, "loss": 0.1421, "step": 20990 }, { "epoch": 2.729044834307992, "grad_norm": 0.5470367074012756, "learning_rate": 2.2709551656920078e-05, "loss": 0.1639, "step": 21000 }, { "epoch": 2.7303443794671862, "grad_norm": 0.4011901021003723, "learning_rate": 2.2696556205328137e-05, "loss": 0.1135, "step": 21010 }, { "epoch": 2.731643924626381, "grad_norm": 0.8339711427688599, "learning_rate": 2.2683560753736193e-05, "loss": 0.1358, "step": 21020 }, { "epoch": 2.732943469785575, "grad_norm": 1.0654600858688354, "learning_rate": 2.267056530214425e-05, "loss": 0.1244, "step": 21030 }, { "epoch": 2.734243014944769, "grad_norm": 0.46190890669822693, "learning_rate": 2.2657569850552308e-05, "loss": 0.1364, "step": 21040 }, { "epoch": 2.735542560103964, "grad_norm": 0.5320155620574951, "learning_rate": 2.2644574398960364e-05, "loss": 0.1006, "step": 21050 }, { "epoch": 2.736842105263158, "grad_norm": 1.0644174814224243, "learning_rate": 2.2631578947368423e-05, "loss": 0.1925, "step": 21060 }, { "epoch": 2.738141650422352, "grad_norm": 0.5617789030075073, "learning_rate": 2.261858349577648e-05, "loss": 0.1431, "step": 21070 }, { "epoch": 2.7394411955815463, "grad_norm": 0.33873844146728516, "learning_rate": 2.2605588044184535e-05, "loss": 0.1484, "step": 21080 }, { "epoch": 2.7407407407407405, "grad_norm": 0.5458827614784241, "learning_rate": 2.2592592592592594e-05, "loss": 0.1707, "step": 21090 }, { "epoch": 2.742040285899935, "grad_norm": 0.310551255941391, "learning_rate": 2.257959714100065e-05, "loss": 0.1631, "step": 21100 }, { "epoch": 2.7433398310591293, "grad_norm": 0.9030607342720032, "learning_rate": 2.256660168940871e-05, "loss": 0.1337, "step": 21110 }, { "epoch": 2.7446393762183234, "grad_norm": 0.3314349949359894, "learning_rate": 2.2553606237816765e-05, "loss": 0.101, "step": 21120 }, { "epoch": 2.745938921377518, "grad_norm": 0.48741716146469116, "learning_rate": 2.254061078622482e-05, "loss": 0.1372, "step": 21130 }, { "epoch": 2.747238466536712, "grad_norm": 0.3199063837528229, "learning_rate": 2.252761533463288e-05, "loss": 0.1129, "step": 21140 }, { "epoch": 2.7485380116959064, "grad_norm": 0.553358256816864, "learning_rate": 2.2514619883040936e-05, "loss": 0.1173, "step": 21150 }, { "epoch": 2.7498375568551006, "grad_norm": 1.0582489967346191, "learning_rate": 2.2501624431448995e-05, "loss": 0.2366, "step": 21160 }, { "epoch": 2.7511371020142947, "grad_norm": 0.7803159952163696, "learning_rate": 2.248862897985705e-05, "loss": 0.1622, "step": 21170 }, { "epoch": 2.7524366471734893, "grad_norm": 0.5033148527145386, "learning_rate": 2.2475633528265107e-05, "loss": 0.1191, "step": 21180 }, { "epoch": 2.7537361923326835, "grad_norm": 0.2899206280708313, "learning_rate": 2.2462638076673166e-05, "loss": 0.17, "step": 21190 }, { "epoch": 2.7550357374918777, "grad_norm": 1.0668151378631592, "learning_rate": 2.2449642625081222e-05, "loss": 0.2012, "step": 21200 }, { "epoch": 2.7563352826510723, "grad_norm": 0.6388433575630188, "learning_rate": 2.243664717348928e-05, "loss": 0.1558, "step": 21210 }, { "epoch": 2.7576348278102665, "grad_norm": 0.47659093141555786, "learning_rate": 2.2423651721897337e-05, "loss": 0.1945, "step": 21220 }, { "epoch": 2.7589343729694606, "grad_norm": 0.3770115375518799, "learning_rate": 2.2410656270305393e-05, "loss": 0.1558, "step": 21230 }, { "epoch": 2.760233918128655, "grad_norm": 1.0048714876174927, "learning_rate": 2.2397660818713452e-05, "loss": 0.2444, "step": 21240 }, { "epoch": 2.761533463287849, "grad_norm": 0.5060768127441406, "learning_rate": 2.2384665367121508e-05, "loss": 0.0854, "step": 21250 }, { "epoch": 2.7628330084470436, "grad_norm": 0.3534630537033081, "learning_rate": 2.2371669915529567e-05, "loss": 0.1375, "step": 21260 }, { "epoch": 2.7641325536062378, "grad_norm": 0.7243227958679199, "learning_rate": 2.2358674463937623e-05, "loss": 0.1711, "step": 21270 }, { "epoch": 2.765432098765432, "grad_norm": 0.18496280908584595, "learning_rate": 2.234567901234568e-05, "loss": 0.1337, "step": 21280 }, { "epoch": 2.7667316439246266, "grad_norm": 0.4882853329181671, "learning_rate": 2.2332683560753735e-05, "loss": 0.1491, "step": 21290 }, { "epoch": 2.7680311890838207, "grad_norm": 1.0128425359725952, "learning_rate": 2.2319688109161794e-05, "loss": 0.1733, "step": 21300 }, { "epoch": 2.769330734243015, "grad_norm": 0.508161187171936, "learning_rate": 2.2306692657569853e-05, "loss": 0.1657, "step": 21310 }, { "epoch": 2.770630279402209, "grad_norm": 0.41281577944755554, "learning_rate": 2.229369720597791e-05, "loss": 0.1408, "step": 21320 }, { "epoch": 2.7719298245614032, "grad_norm": 0.1844843029975891, "learning_rate": 2.2280701754385965e-05, "loss": 0.1236, "step": 21330 }, { "epoch": 2.773229369720598, "grad_norm": 0.4368472993373871, "learning_rate": 2.226770630279402e-05, "loss": 0.1298, "step": 21340 }, { "epoch": 2.774528914879792, "grad_norm": 0.38329043984413147, "learning_rate": 2.225471085120208e-05, "loss": 0.1164, "step": 21350 }, { "epoch": 2.775828460038986, "grad_norm": 0.40894651412963867, "learning_rate": 2.224171539961014e-05, "loss": 0.1637, "step": 21360 }, { "epoch": 2.777128005198181, "grad_norm": 0.22685953974723816, "learning_rate": 2.2228719948018195e-05, "loss": 0.2134, "step": 21370 }, { "epoch": 2.778427550357375, "grad_norm": 0.3016496002674103, "learning_rate": 2.221572449642625e-05, "loss": 0.1664, "step": 21380 }, { "epoch": 2.779727095516569, "grad_norm": 0.38338276743888855, "learning_rate": 2.2202729044834307e-05, "loss": 0.1077, "step": 21390 }, { "epoch": 2.7810266406757633, "grad_norm": 0.42679914832115173, "learning_rate": 2.2189733593242366e-05, "loss": 0.1377, "step": 21400 }, { "epoch": 2.7823261858349575, "grad_norm": 0.47134411334991455, "learning_rate": 2.2176738141650426e-05, "loss": 0.1362, "step": 21410 }, { "epoch": 2.783625730994152, "grad_norm": 0.21092751622200012, "learning_rate": 2.216374269005848e-05, "loss": 0.2123, "step": 21420 }, { "epoch": 2.7849252761533463, "grad_norm": 0.8218830823898315, "learning_rate": 2.2150747238466537e-05, "loss": 0.1992, "step": 21430 }, { "epoch": 2.7862248213125405, "grad_norm": 0.35547712445259094, "learning_rate": 2.2137751786874593e-05, "loss": 0.1805, "step": 21440 }, { "epoch": 2.787524366471735, "grad_norm": 0.3161270320415497, "learning_rate": 2.2124756335282652e-05, "loss": 0.0686, "step": 21450 }, { "epoch": 2.7888239116309292, "grad_norm": 0.1624176800251007, "learning_rate": 2.211176088369071e-05, "loss": 0.1772, "step": 21460 }, { "epoch": 2.7901234567901234, "grad_norm": 0.45754802227020264, "learning_rate": 2.2098765432098767e-05, "loss": 0.153, "step": 21470 }, { "epoch": 2.7914230019493176, "grad_norm": 1.3837409019470215, "learning_rate": 2.2085769980506823e-05, "loss": 0.168, "step": 21480 }, { "epoch": 2.7927225471085118, "grad_norm": 0.507602334022522, "learning_rate": 2.207277452891488e-05, "loss": 0.1884, "step": 21490 }, { "epoch": 2.7940220922677064, "grad_norm": 0.187711700797081, "learning_rate": 2.205977907732294e-05, "loss": 0.1059, "step": 21500 }, { "epoch": 2.7953216374269005, "grad_norm": 0.5836420059204102, "learning_rate": 2.2046783625730998e-05, "loss": 0.1083, "step": 21510 }, { "epoch": 2.7966211825860947, "grad_norm": 0.210210382938385, "learning_rate": 2.2033788174139054e-05, "loss": 0.1327, "step": 21520 }, { "epoch": 2.7979207277452893, "grad_norm": 0.8617414832115173, "learning_rate": 2.202079272254711e-05, "loss": 0.2147, "step": 21530 }, { "epoch": 2.7992202729044835, "grad_norm": 0.5592591166496277, "learning_rate": 2.2007797270955165e-05, "loss": 0.1111, "step": 21540 }, { "epoch": 2.8005198180636777, "grad_norm": 0.359069287776947, "learning_rate": 2.199480181936322e-05, "loss": 0.1905, "step": 21550 }, { "epoch": 2.801819363222872, "grad_norm": 0.44852766394615173, "learning_rate": 2.1981806367771284e-05, "loss": 0.1445, "step": 21560 }, { "epoch": 2.803118908382066, "grad_norm": 0.4870377480983734, "learning_rate": 2.196881091617934e-05, "loss": 0.1075, "step": 21570 }, { "epoch": 2.8044184535412606, "grad_norm": 0.5046212077140808, "learning_rate": 2.1955815464587395e-05, "loss": 0.1586, "step": 21580 }, { "epoch": 2.805717998700455, "grad_norm": 0.18289828300476074, "learning_rate": 2.194282001299545e-05, "loss": 0.119, "step": 21590 }, { "epoch": 2.807017543859649, "grad_norm": 0.44257616996765137, "learning_rate": 2.1929824561403507e-05, "loss": 0.1752, "step": 21600 }, { "epoch": 2.8083170890188436, "grad_norm": 0.56949782371521, "learning_rate": 2.1916829109811566e-05, "loss": 0.1161, "step": 21610 }, { "epoch": 2.8096166341780378, "grad_norm": 0.2821151614189148, "learning_rate": 2.1903833658219626e-05, "loss": 0.1609, "step": 21620 }, { "epoch": 2.810916179337232, "grad_norm": 0.5415271520614624, "learning_rate": 2.189083820662768e-05, "loss": 0.1587, "step": 21630 }, { "epoch": 2.812215724496426, "grad_norm": 0.6645976305007935, "learning_rate": 2.1877842755035737e-05, "loss": 0.1819, "step": 21640 }, { "epoch": 2.8135152696556203, "grad_norm": 0.18120421469211578, "learning_rate": 2.1864847303443793e-05, "loss": 0.1107, "step": 21650 }, { "epoch": 2.814814814814815, "grad_norm": 0.5468969345092773, "learning_rate": 2.1851851851851852e-05, "loss": 0.1036, "step": 21660 }, { "epoch": 2.816114359974009, "grad_norm": 0.7839078307151794, "learning_rate": 2.183885640025991e-05, "loss": 0.1248, "step": 21670 }, { "epoch": 2.8174139051332032, "grad_norm": 0.509540319442749, "learning_rate": 2.1825860948667968e-05, "loss": 0.1412, "step": 21680 }, { "epoch": 2.818713450292398, "grad_norm": 0.8411994576454163, "learning_rate": 2.1812865497076023e-05, "loss": 0.1857, "step": 21690 }, { "epoch": 2.820012995451592, "grad_norm": 0.4195130467414856, "learning_rate": 2.179987004548408e-05, "loss": 0.2172, "step": 21700 }, { "epoch": 2.821312540610786, "grad_norm": 0.44781360030174255, "learning_rate": 2.178687459389214e-05, "loss": 0.1438, "step": 21710 }, { "epoch": 2.8226120857699804, "grad_norm": 0.47763702273368835, "learning_rate": 2.1773879142300198e-05, "loss": 0.1735, "step": 21720 }, { "epoch": 2.8239116309291745, "grad_norm": 0.30495285987854004, "learning_rate": 2.1760883690708254e-05, "loss": 0.1475, "step": 21730 }, { "epoch": 2.825211176088369, "grad_norm": 0.16703377664089203, "learning_rate": 2.174788823911631e-05, "loss": 0.2005, "step": 21740 }, { "epoch": 2.8265107212475633, "grad_norm": 0.40644779801368713, "learning_rate": 2.1734892787524365e-05, "loss": 0.1079, "step": 21750 }, { "epoch": 2.8278102664067575, "grad_norm": 0.13755041360855103, "learning_rate": 2.1721897335932425e-05, "loss": 0.186, "step": 21760 }, { "epoch": 2.829109811565952, "grad_norm": 0.10866597294807434, "learning_rate": 2.1708901884340484e-05, "loss": 0.1435, "step": 21770 }, { "epoch": 2.8304093567251463, "grad_norm": 1.1941004991531372, "learning_rate": 2.169590643274854e-05, "loss": 0.1567, "step": 21780 }, { "epoch": 2.8317089018843404, "grad_norm": 0.7200698852539062, "learning_rate": 2.1682910981156596e-05, "loss": 0.1183, "step": 21790 }, { "epoch": 2.8330084470435346, "grad_norm": 0.6522788405418396, "learning_rate": 2.166991552956465e-05, "loss": 0.2637, "step": 21800 }, { "epoch": 2.8343079922027288, "grad_norm": 0.3804117441177368, "learning_rate": 2.165692007797271e-05, "loss": 0.2483, "step": 21810 }, { "epoch": 2.8356075373619234, "grad_norm": 0.3000979721546173, "learning_rate": 2.164392462638077e-05, "loss": 0.2179, "step": 21820 }, { "epoch": 2.8369070825211176, "grad_norm": 0.3153296709060669, "learning_rate": 2.1630929174788826e-05, "loss": 0.1795, "step": 21830 }, { "epoch": 2.8382066276803117, "grad_norm": 0.9016162753105164, "learning_rate": 2.161793372319688e-05, "loss": 0.126, "step": 21840 }, { "epoch": 2.8395061728395063, "grad_norm": 0.8025569915771484, "learning_rate": 2.1604938271604937e-05, "loss": 0.1409, "step": 21850 }, { "epoch": 2.8408057179987005, "grad_norm": 0.7763060927391052, "learning_rate": 2.1591942820012997e-05, "loss": 0.2034, "step": 21860 }, { "epoch": 2.8421052631578947, "grad_norm": 0.13551446795463562, "learning_rate": 2.1578947368421053e-05, "loss": 0.1159, "step": 21870 }, { "epoch": 2.843404808317089, "grad_norm": 0.1643756628036499, "learning_rate": 2.1565951916829112e-05, "loss": 0.0914, "step": 21880 }, { "epoch": 2.844704353476283, "grad_norm": 0.3890928328037262, "learning_rate": 2.1552956465237168e-05, "loss": 0.1152, "step": 21890 }, { "epoch": 2.8460038986354776, "grad_norm": 1.6052559614181519, "learning_rate": 2.1539961013645223e-05, "loss": 0.2419, "step": 21900 }, { "epoch": 2.847303443794672, "grad_norm": 0.579932689666748, "learning_rate": 2.1526965562053283e-05, "loss": 0.1179, "step": 21910 }, { "epoch": 2.848602988953866, "grad_norm": 0.354056715965271, "learning_rate": 2.151397011046134e-05, "loss": 0.1155, "step": 21920 }, { "epoch": 2.8499025341130606, "grad_norm": 0.8322489857673645, "learning_rate": 2.1500974658869398e-05, "loss": 0.1594, "step": 21930 }, { "epoch": 2.8512020792722548, "grad_norm": 0.9493608474731445, "learning_rate": 2.1487979207277454e-05, "loss": 0.1731, "step": 21940 }, { "epoch": 2.852501624431449, "grad_norm": 0.2543759047985077, "learning_rate": 2.147498375568551e-05, "loss": 0.201, "step": 21950 }, { "epoch": 2.853801169590643, "grad_norm": 0.1864827424287796, "learning_rate": 2.146198830409357e-05, "loss": 0.1195, "step": 21960 }, { "epoch": 2.8551007147498373, "grad_norm": 0.3131886422634125, "learning_rate": 2.1448992852501625e-05, "loss": 0.1289, "step": 21970 }, { "epoch": 2.856400259909032, "grad_norm": 0.787681519985199, "learning_rate": 2.1435997400909684e-05, "loss": 0.1474, "step": 21980 }, { "epoch": 2.857699805068226, "grad_norm": 0.3818584978580475, "learning_rate": 2.142300194931774e-05, "loss": 0.129, "step": 21990 }, { "epoch": 2.8589993502274202, "grad_norm": 0.5809007883071899, "learning_rate": 2.1410006497725796e-05, "loss": 0.2441, "step": 22000 }, { "epoch": 2.860298895386615, "grad_norm": 0.34792813658714294, "learning_rate": 2.1397011046133855e-05, "loss": 0.1458, "step": 22010 }, { "epoch": 2.861598440545809, "grad_norm": 0.4025619924068451, "learning_rate": 2.138401559454191e-05, "loss": 0.1508, "step": 22020 }, { "epoch": 2.862897985705003, "grad_norm": 0.24634915590286255, "learning_rate": 2.137102014294997e-05, "loss": 0.092, "step": 22030 }, { "epoch": 2.8641975308641974, "grad_norm": 0.9935775399208069, "learning_rate": 2.1358024691358026e-05, "loss": 0.1318, "step": 22040 }, { "epoch": 2.8654970760233915, "grad_norm": 0.6922814249992371, "learning_rate": 2.134502923976608e-05, "loss": 0.1742, "step": 22050 }, { "epoch": 2.866796621182586, "grad_norm": 0.4326859712600708, "learning_rate": 2.133203378817414e-05, "loss": 0.1396, "step": 22060 }, { "epoch": 2.8680961663417803, "grad_norm": 0.7429693937301636, "learning_rate": 2.1319038336582197e-05, "loss": 0.1286, "step": 22070 }, { "epoch": 2.8693957115009745, "grad_norm": 0.2910921275615692, "learning_rate": 2.1306042884990256e-05, "loss": 0.124, "step": 22080 }, { "epoch": 2.870695256660169, "grad_norm": 0.2233661711215973, "learning_rate": 2.1293047433398312e-05, "loss": 0.1213, "step": 22090 }, { "epoch": 2.8719948018193633, "grad_norm": 0.635129451751709, "learning_rate": 2.1280051981806368e-05, "loss": 0.1968, "step": 22100 }, { "epoch": 2.8732943469785575, "grad_norm": 0.48773473501205444, "learning_rate": 2.1267056530214427e-05, "loss": 0.1444, "step": 22110 }, { "epoch": 2.8745938921377516, "grad_norm": 0.4711586534976959, "learning_rate": 2.1254061078622483e-05, "loss": 0.1551, "step": 22120 }, { "epoch": 2.875893437296946, "grad_norm": 0.7784937024116516, "learning_rate": 2.124106562703054e-05, "loss": 0.1573, "step": 22130 }, { "epoch": 2.8771929824561404, "grad_norm": 0.2170625925064087, "learning_rate": 2.1228070175438598e-05, "loss": 0.0995, "step": 22140 }, { "epoch": 2.8784925276153346, "grad_norm": 0.28505223989486694, "learning_rate": 2.1215074723846654e-05, "loss": 0.0929, "step": 22150 }, { "epoch": 2.8797920727745288, "grad_norm": 1.0977483987808228, "learning_rate": 2.1202079272254713e-05, "loss": 0.1518, "step": 22160 }, { "epoch": 2.8810916179337234, "grad_norm": 0.4906783401966095, "learning_rate": 2.118908382066277e-05, "loss": 0.1368, "step": 22170 }, { "epoch": 2.8823911630929175, "grad_norm": 0.3641717731952667, "learning_rate": 2.1176088369070825e-05, "loss": 0.1376, "step": 22180 }, { "epoch": 2.8836907082521117, "grad_norm": 1.225648045539856, "learning_rate": 2.1163092917478884e-05, "loss": 0.2276, "step": 22190 }, { "epoch": 2.884990253411306, "grad_norm": 0.642654538154602, "learning_rate": 2.115009746588694e-05, "loss": 0.0989, "step": 22200 }, { "epoch": 2.8862897985705, "grad_norm": 0.7119883894920349, "learning_rate": 2.1137102014295e-05, "loss": 0.1673, "step": 22210 }, { "epoch": 2.8875893437296947, "grad_norm": 0.3349597454071045, "learning_rate": 2.1124106562703055e-05, "loss": 0.1802, "step": 22220 }, { "epoch": 2.888888888888889, "grad_norm": 0.6835513114929199, "learning_rate": 2.111111111111111e-05, "loss": 0.0811, "step": 22230 }, { "epoch": 2.890188434048083, "grad_norm": 0.5001293420791626, "learning_rate": 2.109811565951917e-05, "loss": 0.201, "step": 22240 }, { "epoch": 2.8914879792072776, "grad_norm": 0.2692873477935791, "learning_rate": 2.1085120207927226e-05, "loss": 0.2571, "step": 22250 }, { "epoch": 2.892787524366472, "grad_norm": 1.1461564302444458, "learning_rate": 2.1072124756335285e-05, "loss": 0.1872, "step": 22260 }, { "epoch": 2.894087069525666, "grad_norm": 0.5326513051986694, "learning_rate": 2.105912930474334e-05, "loss": 0.1337, "step": 22270 }, { "epoch": 2.89538661468486, "grad_norm": 0.40033450722694397, "learning_rate": 2.1046133853151397e-05, "loss": 0.0968, "step": 22280 }, { "epoch": 2.8966861598440543, "grad_norm": 0.18071970343589783, "learning_rate": 2.1033138401559456e-05, "loss": 0.1293, "step": 22290 }, { "epoch": 2.897985705003249, "grad_norm": 0.24095745384693146, "learning_rate": 2.1020142949967512e-05, "loss": 0.1124, "step": 22300 }, { "epoch": 2.899285250162443, "grad_norm": 0.22735489904880524, "learning_rate": 2.100714749837557e-05, "loss": 0.1307, "step": 22310 }, { "epoch": 2.9005847953216373, "grad_norm": 0.6823194026947021, "learning_rate": 2.0994152046783627e-05, "loss": 0.1195, "step": 22320 }, { "epoch": 2.901884340480832, "grad_norm": 0.7727157473564148, "learning_rate": 2.0981156595191683e-05, "loss": 0.1707, "step": 22330 }, { "epoch": 2.903183885640026, "grad_norm": 0.6865425109863281, "learning_rate": 2.0968161143599742e-05, "loss": 0.1176, "step": 22340 }, { "epoch": 2.9044834307992202, "grad_norm": 0.4567386209964752, "learning_rate": 2.0955165692007798e-05, "loss": 0.2196, "step": 22350 }, { "epoch": 2.9057829759584144, "grad_norm": 0.3828189969062805, "learning_rate": 2.0942170240415857e-05, "loss": 0.1734, "step": 22360 }, { "epoch": 2.9070825211176086, "grad_norm": 0.37845006585121155, "learning_rate": 2.0929174788823913e-05, "loss": 0.126, "step": 22370 }, { "epoch": 2.908382066276803, "grad_norm": 0.5314100384712219, "learning_rate": 2.091617933723197e-05, "loss": 0.131, "step": 22380 }, { "epoch": 2.9096816114359974, "grad_norm": 0.5298073887825012, "learning_rate": 2.0903183885640025e-05, "loss": 0.1646, "step": 22390 }, { "epoch": 2.9109811565951915, "grad_norm": 0.2872404158115387, "learning_rate": 2.0890188434048084e-05, "loss": 0.1316, "step": 22400 }, { "epoch": 2.912280701754386, "grad_norm": 0.4286971986293793, "learning_rate": 2.0877192982456143e-05, "loss": 0.1734, "step": 22410 }, { "epoch": 2.9135802469135803, "grad_norm": 0.4710727632045746, "learning_rate": 2.08641975308642e-05, "loss": 0.1252, "step": 22420 }, { "epoch": 2.9148797920727745, "grad_norm": 1.0966509580612183, "learning_rate": 2.0851202079272255e-05, "loss": 0.2226, "step": 22430 }, { "epoch": 2.9161793372319686, "grad_norm": 0.3798547685146332, "learning_rate": 2.083820662768031e-05, "loss": 0.126, "step": 22440 }, { "epoch": 2.917478882391163, "grad_norm": 0.33570149540901184, "learning_rate": 2.082521117608837e-05, "loss": 0.1495, "step": 22450 }, { "epoch": 2.9187784275503574, "grad_norm": 0.9347420334815979, "learning_rate": 2.081221572449643e-05, "loss": 0.2892, "step": 22460 }, { "epoch": 2.9200779727095516, "grad_norm": 0.3632754981517792, "learning_rate": 2.0799220272904485e-05, "loss": 0.1356, "step": 22470 }, { "epoch": 2.9213775178687458, "grad_norm": 0.2607164978981018, "learning_rate": 2.078622482131254e-05, "loss": 0.1265, "step": 22480 }, { "epoch": 2.9226770630279404, "grad_norm": 0.5788735151290894, "learning_rate": 2.0773229369720597e-05, "loss": 0.0938, "step": 22490 }, { "epoch": 2.9239766081871346, "grad_norm": 0.8134664297103882, "learning_rate": 2.0760233918128656e-05, "loss": 0.1285, "step": 22500 }, { "epoch": 2.9252761533463287, "grad_norm": 0.45775455236434937, "learning_rate": 2.0747238466536715e-05, "loss": 0.1718, "step": 22510 }, { "epoch": 2.926575698505523, "grad_norm": 0.1511945277452469, "learning_rate": 2.073424301494477e-05, "loss": 0.1124, "step": 22520 }, { "epoch": 2.927875243664717, "grad_norm": 0.3529638350009918, "learning_rate": 2.0721247563352827e-05, "loss": 0.1241, "step": 22530 }, { "epoch": 2.9291747888239117, "grad_norm": 0.8430107831954956, "learning_rate": 2.0708252111760883e-05, "loss": 0.1445, "step": 22540 }, { "epoch": 2.930474333983106, "grad_norm": 0.21005378663539886, "learning_rate": 2.0695256660168942e-05, "loss": 0.1077, "step": 22550 }, { "epoch": 2.9317738791423, "grad_norm": 0.7176408171653748, "learning_rate": 2.0682261208577e-05, "loss": 0.1081, "step": 22560 }, { "epoch": 2.9330734243014946, "grad_norm": 1.5221245288848877, "learning_rate": 2.0669265756985057e-05, "loss": 0.2152, "step": 22570 }, { "epoch": 2.934372969460689, "grad_norm": 0.6516498923301697, "learning_rate": 2.0656270305393113e-05, "loss": 0.1492, "step": 22580 }, { "epoch": 2.935672514619883, "grad_norm": 0.4447478652000427, "learning_rate": 2.064327485380117e-05, "loss": 0.1534, "step": 22590 }, { "epoch": 2.936972059779077, "grad_norm": 0.7820769548416138, "learning_rate": 2.0630279402209228e-05, "loss": 0.149, "step": 22600 }, { "epoch": 2.9382716049382713, "grad_norm": 0.25009286403656006, "learning_rate": 2.0617283950617287e-05, "loss": 0.1431, "step": 22610 }, { "epoch": 2.939571150097466, "grad_norm": 0.27069318294525146, "learning_rate": 2.0604288499025343e-05, "loss": 0.1061, "step": 22620 }, { "epoch": 2.94087069525666, "grad_norm": 0.38477572798728943, "learning_rate": 2.05912930474334e-05, "loss": 0.1165, "step": 22630 }, { "epoch": 2.9421702404158543, "grad_norm": 0.4533068537712097, "learning_rate": 2.0578297595841455e-05, "loss": 0.1422, "step": 22640 }, { "epoch": 2.943469785575049, "grad_norm": 0.5766756534576416, "learning_rate": 2.056530214424951e-05, "loss": 0.1224, "step": 22650 }, { "epoch": 2.944769330734243, "grad_norm": 0.5352323651313782, "learning_rate": 2.0552306692657574e-05, "loss": 0.1182, "step": 22660 }, { "epoch": 2.9460688758934372, "grad_norm": 0.603349506855011, "learning_rate": 2.053931124106563e-05, "loss": 0.1542, "step": 22670 }, { "epoch": 2.9473684210526314, "grad_norm": 0.2940896451473236, "learning_rate": 2.0526315789473685e-05, "loss": 0.1357, "step": 22680 }, { "epoch": 2.9486679662118256, "grad_norm": 0.5986506342887878, "learning_rate": 2.051332033788174e-05, "loss": 0.1309, "step": 22690 }, { "epoch": 2.94996751137102, "grad_norm": 0.37104398012161255, "learning_rate": 2.0500324886289797e-05, "loss": 0.1345, "step": 22700 }, { "epoch": 2.9512670565302144, "grad_norm": 0.3605065643787384, "learning_rate": 2.0487329434697856e-05, "loss": 0.1505, "step": 22710 }, { "epoch": 2.9525666016894085, "grad_norm": 0.5718125700950623, "learning_rate": 2.0474333983105915e-05, "loss": 0.1119, "step": 22720 }, { "epoch": 2.953866146848603, "grad_norm": 0.7282840013504028, "learning_rate": 2.046133853151397e-05, "loss": 0.1352, "step": 22730 }, { "epoch": 2.9551656920077973, "grad_norm": 0.42170441150665283, "learning_rate": 2.0448343079922027e-05, "loss": 0.0781, "step": 22740 }, { "epoch": 2.9564652371669915, "grad_norm": 0.5929535627365112, "learning_rate": 2.0435347628330083e-05, "loss": 0.1738, "step": 22750 }, { "epoch": 2.9577647823261857, "grad_norm": 1.0183204412460327, "learning_rate": 2.0422352176738142e-05, "loss": 0.1636, "step": 22760 }, { "epoch": 2.95906432748538, "grad_norm": 0.7220844626426697, "learning_rate": 2.04093567251462e-05, "loss": 0.1582, "step": 22770 }, { "epoch": 2.9603638726445745, "grad_norm": 0.27884235978126526, "learning_rate": 2.0396361273554257e-05, "loss": 0.1167, "step": 22780 }, { "epoch": 2.9616634178037686, "grad_norm": 0.2078397423028946, "learning_rate": 2.0383365821962313e-05, "loss": 0.1636, "step": 22790 }, { "epoch": 2.962962962962963, "grad_norm": 0.39957287907600403, "learning_rate": 2.037037037037037e-05, "loss": 0.0832, "step": 22800 }, { "epoch": 2.9642625081221574, "grad_norm": 0.5017943382263184, "learning_rate": 2.035737491877843e-05, "loss": 0.1341, "step": 22810 }, { "epoch": 2.9655620532813516, "grad_norm": 0.22774629294872284, "learning_rate": 2.0344379467186488e-05, "loss": 0.1409, "step": 22820 }, { "epoch": 2.9668615984405458, "grad_norm": 0.36921629309654236, "learning_rate": 2.0331384015594543e-05, "loss": 0.1223, "step": 22830 }, { "epoch": 2.96816114359974, "grad_norm": 0.3813703656196594, "learning_rate": 2.03183885640026e-05, "loss": 0.0928, "step": 22840 }, { "epoch": 2.969460688758934, "grad_norm": 0.3078412115573883, "learning_rate": 2.0305393112410655e-05, "loss": 0.2064, "step": 22850 }, { "epoch": 2.9707602339181287, "grad_norm": 0.3858896493911743, "learning_rate": 2.0292397660818714e-05, "loss": 0.1209, "step": 22860 }, { "epoch": 2.972059779077323, "grad_norm": 0.5300585031509399, "learning_rate": 2.0279402209226774e-05, "loss": 0.1326, "step": 22870 }, { "epoch": 2.973359324236517, "grad_norm": 0.21899940073490143, "learning_rate": 2.026640675763483e-05, "loss": 0.0781, "step": 22880 }, { "epoch": 2.9746588693957117, "grad_norm": 0.40345820784568787, "learning_rate": 2.0253411306042885e-05, "loss": 0.1015, "step": 22890 }, { "epoch": 2.975958414554906, "grad_norm": 1.0039652585983276, "learning_rate": 2.024041585445094e-05, "loss": 0.2327, "step": 22900 }, { "epoch": 2.9772579597141, "grad_norm": 0.3952500522136688, "learning_rate": 2.0227420402859e-05, "loss": 0.1873, "step": 22910 }, { "epoch": 2.978557504873294, "grad_norm": 0.7851987481117249, "learning_rate": 2.021442495126706e-05, "loss": 0.1868, "step": 22920 }, { "epoch": 2.9798570500324884, "grad_norm": 0.4514578580856323, "learning_rate": 2.0201429499675116e-05, "loss": 0.1416, "step": 22930 }, { "epoch": 2.981156595191683, "grad_norm": 0.9644458293914795, "learning_rate": 2.018843404808317e-05, "loss": 0.1671, "step": 22940 }, { "epoch": 2.982456140350877, "grad_norm": 0.5297955870628357, "learning_rate": 2.0175438596491227e-05, "loss": 0.1481, "step": 22950 }, { "epoch": 2.9837556855100713, "grad_norm": 0.19380001723766327, "learning_rate": 2.0162443144899286e-05, "loss": 0.1709, "step": 22960 }, { "epoch": 2.985055230669266, "grad_norm": 0.8199558854103088, "learning_rate": 2.0149447693307342e-05, "loss": 0.2256, "step": 22970 }, { "epoch": 2.98635477582846, "grad_norm": 0.8140642046928406, "learning_rate": 2.01364522417154e-05, "loss": 0.1623, "step": 22980 }, { "epoch": 2.9876543209876543, "grad_norm": 0.28020018339157104, "learning_rate": 2.0123456790123457e-05, "loss": 0.142, "step": 22990 }, { "epoch": 2.9889538661468484, "grad_norm": 0.3969995975494385, "learning_rate": 2.0110461338531513e-05, "loss": 0.1179, "step": 23000 }, { "epoch": 2.9902534113060426, "grad_norm": 0.5141519904136658, "learning_rate": 2.0097465886939573e-05, "loss": 0.1217, "step": 23010 }, { "epoch": 2.9915529564652372, "grad_norm": 0.7802230715751648, "learning_rate": 2.008447043534763e-05, "loss": 0.3229, "step": 23020 }, { "epoch": 2.9928525016244314, "grad_norm": 0.3817462921142578, "learning_rate": 2.0071474983755688e-05, "loss": 0.0978, "step": 23030 }, { "epoch": 2.9941520467836256, "grad_norm": 0.6673833131790161, "learning_rate": 2.0058479532163744e-05, "loss": 0.219, "step": 23040 }, { "epoch": 2.99545159194282, "grad_norm": 1.0881553888320923, "learning_rate": 2.00454840805718e-05, "loss": 0.1579, "step": 23050 }, { "epoch": 2.9967511371020144, "grad_norm": 0.295635461807251, "learning_rate": 2.003248862897986e-05, "loss": 0.1189, "step": 23060 }, { "epoch": 2.9980506822612085, "grad_norm": 0.12339892983436584, "learning_rate": 2.0019493177387914e-05, "loss": 0.1023, "step": 23070 }, { "epoch": 2.9993502274204027, "grad_norm": 0.39136752486228943, "learning_rate": 2.0006497725795974e-05, "loss": 0.1688, "step": 23080 }, { "epoch": 3.0, "eval_loss": 0.10019104182720184, "eval_runtime": 852.2866, "eval_samples_per_second": 9.029, "eval_steps_per_second": 9.029, "step": 23085 }, { "epoch": 3.0006497725795973, "grad_norm": 0.19491663575172424, "learning_rate": 1.999350227420403e-05, "loss": 0.1689, "step": 23090 }, { "epoch": 3.0019493177387915, "grad_norm": 0.4197196960449219, "learning_rate": 1.9980506822612085e-05, "loss": 0.1037, "step": 23100 }, { "epoch": 3.0032488628979856, "grad_norm": 0.3379548192024231, "learning_rate": 1.9967511371020145e-05, "loss": 0.1322, "step": 23110 }, { "epoch": 3.00454840805718, "grad_norm": 0.3797471821308136, "learning_rate": 1.99545159194282e-05, "loss": 0.1537, "step": 23120 }, { "epoch": 3.0058479532163744, "grad_norm": 0.5199289917945862, "learning_rate": 1.994152046783626e-05, "loss": 0.1138, "step": 23130 }, { "epoch": 3.0071474983755686, "grad_norm": 0.7062774896621704, "learning_rate": 1.9928525016244316e-05, "loss": 0.0918, "step": 23140 }, { "epoch": 3.0084470435347628, "grad_norm": 1.0185409784317017, "learning_rate": 1.991552956465237e-05, "loss": 0.0955, "step": 23150 }, { "epoch": 3.009746588693957, "grad_norm": 1.1230676174163818, "learning_rate": 1.990253411306043e-05, "loss": 0.1661, "step": 23160 }, { "epoch": 3.0110461338531516, "grad_norm": 0.6632408499717712, "learning_rate": 1.9889538661468487e-05, "loss": 0.1379, "step": 23170 }, { "epoch": 3.0123456790123457, "grad_norm": 0.9351013898849487, "learning_rate": 1.9876543209876546e-05, "loss": 0.1384, "step": 23180 }, { "epoch": 3.01364522417154, "grad_norm": 0.31277552247047424, "learning_rate": 1.98635477582846e-05, "loss": 0.0817, "step": 23190 }, { "epoch": 3.014944769330734, "grad_norm": 0.16408218443393707, "learning_rate": 1.9850552306692658e-05, "loss": 0.1161, "step": 23200 }, { "epoch": 3.0162443144899287, "grad_norm": 0.40638235211372375, "learning_rate": 1.9837556855100717e-05, "loss": 0.113, "step": 23210 }, { "epoch": 3.017543859649123, "grad_norm": 0.5466504693031311, "learning_rate": 1.9824561403508773e-05, "loss": 0.1136, "step": 23220 }, { "epoch": 3.018843404808317, "grad_norm": 0.4297522306442261, "learning_rate": 1.981156595191683e-05, "loss": 0.0998, "step": 23230 }, { "epoch": 3.020142949967511, "grad_norm": 0.29351842403411865, "learning_rate": 1.9798570500324888e-05, "loss": 0.103, "step": 23240 }, { "epoch": 3.021442495126706, "grad_norm": 0.3788178563117981, "learning_rate": 1.9785575048732944e-05, "loss": 0.1052, "step": 23250 }, { "epoch": 3.0227420402859, "grad_norm": 0.6665863990783691, "learning_rate": 1.9772579597141003e-05, "loss": 0.2123, "step": 23260 }, { "epoch": 3.024041585445094, "grad_norm": 0.5849595069885254, "learning_rate": 1.975958414554906e-05, "loss": 0.1009, "step": 23270 }, { "epoch": 3.0253411306042883, "grad_norm": 0.395834356546402, "learning_rate": 1.9746588693957115e-05, "loss": 0.1307, "step": 23280 }, { "epoch": 3.026640675763483, "grad_norm": 0.33403530716896057, "learning_rate": 1.9733593242365174e-05, "loss": 0.1294, "step": 23290 }, { "epoch": 3.027940220922677, "grad_norm": 0.29662758111953735, "learning_rate": 1.972059779077323e-05, "loss": 0.1244, "step": 23300 }, { "epoch": 3.0292397660818713, "grad_norm": 0.24916544556617737, "learning_rate": 1.970760233918129e-05, "loss": 0.1241, "step": 23310 }, { "epoch": 3.0305393112410655, "grad_norm": 0.5036481022834778, "learning_rate": 1.9694606887589345e-05, "loss": 0.077, "step": 23320 }, { "epoch": 3.03183885640026, "grad_norm": 0.41734522581100464, "learning_rate": 1.96816114359974e-05, "loss": 0.14, "step": 23330 }, { "epoch": 3.0331384015594542, "grad_norm": 0.3366347551345825, "learning_rate": 1.966861598440546e-05, "loss": 0.0952, "step": 23340 }, { "epoch": 3.0344379467186484, "grad_norm": 0.5328202843666077, "learning_rate": 1.9655620532813516e-05, "loss": 0.1096, "step": 23350 }, { "epoch": 3.0357374918778426, "grad_norm": 0.5512514114379883, "learning_rate": 1.9642625081221575e-05, "loss": 0.1545, "step": 23360 }, { "epoch": 3.037037037037037, "grad_norm": 0.7206323146820068, "learning_rate": 1.962962962962963e-05, "loss": 0.1175, "step": 23370 }, { "epoch": 3.0383365821962314, "grad_norm": 0.8021720051765442, "learning_rate": 1.9616634178037687e-05, "loss": 0.1533, "step": 23380 }, { "epoch": 3.0396361273554255, "grad_norm": 0.5219976305961609, "learning_rate": 1.9603638726445746e-05, "loss": 0.1106, "step": 23390 }, { "epoch": 3.0409356725146197, "grad_norm": 0.2450275719165802, "learning_rate": 1.9590643274853802e-05, "loss": 0.0733, "step": 23400 }, { "epoch": 3.0422352176738143, "grad_norm": 0.4654599130153656, "learning_rate": 1.957764782326186e-05, "loss": 0.2094, "step": 23410 }, { "epoch": 3.0435347628330085, "grad_norm": 0.1837691366672516, "learning_rate": 1.9564652371669917e-05, "loss": 0.121, "step": 23420 }, { "epoch": 3.0448343079922027, "grad_norm": 0.23535442352294922, "learning_rate": 1.9551656920077973e-05, "loss": 0.1158, "step": 23430 }, { "epoch": 3.046133853151397, "grad_norm": 0.3673548102378845, "learning_rate": 1.9538661468486032e-05, "loss": 0.1498, "step": 23440 }, { "epoch": 3.0474333983105915, "grad_norm": 0.20127242803573608, "learning_rate": 1.9525666016894088e-05, "loss": 0.13, "step": 23450 }, { "epoch": 3.0487329434697856, "grad_norm": 0.254807710647583, "learning_rate": 1.9512670565302147e-05, "loss": 0.152, "step": 23460 }, { "epoch": 3.05003248862898, "grad_norm": 0.5179461240768433, "learning_rate": 1.9499675113710203e-05, "loss": 0.1068, "step": 23470 }, { "epoch": 3.051332033788174, "grad_norm": 0.34533393383026123, "learning_rate": 1.948667966211826e-05, "loss": 0.1009, "step": 23480 }, { "epoch": 3.0526315789473686, "grad_norm": 0.2751319706439972, "learning_rate": 1.9473684210526315e-05, "loss": 0.1513, "step": 23490 }, { "epoch": 3.0539311241065628, "grad_norm": 0.14880076050758362, "learning_rate": 1.9460688758934374e-05, "loss": 0.073, "step": 23500 }, { "epoch": 3.055230669265757, "grad_norm": 0.3198152780532837, "learning_rate": 1.9447693307342433e-05, "loss": 0.0968, "step": 23510 }, { "epoch": 3.056530214424951, "grad_norm": 0.7202719449996948, "learning_rate": 1.943469785575049e-05, "loss": 0.1362, "step": 23520 }, { "epoch": 3.0578297595841457, "grad_norm": 0.5668863654136658, "learning_rate": 1.9421702404158545e-05, "loss": 0.1961, "step": 23530 }, { "epoch": 3.05912930474334, "grad_norm": 0.8878369331359863, "learning_rate": 1.94087069525666e-05, "loss": 0.1271, "step": 23540 }, { "epoch": 3.060428849902534, "grad_norm": 0.42442476749420166, "learning_rate": 1.939571150097466e-05, "loss": 0.0926, "step": 23550 }, { "epoch": 3.0617283950617282, "grad_norm": 0.6778298616409302, "learning_rate": 1.938271604938272e-05, "loss": 0.1081, "step": 23560 }, { "epoch": 3.063027940220923, "grad_norm": 0.3457668125629425, "learning_rate": 1.9369720597790775e-05, "loss": 0.0805, "step": 23570 }, { "epoch": 3.064327485380117, "grad_norm": 0.2574026584625244, "learning_rate": 1.935672514619883e-05, "loss": 0.1141, "step": 23580 }, { "epoch": 3.065627030539311, "grad_norm": 0.24309398233890533, "learning_rate": 1.9343729694606887e-05, "loss": 0.1203, "step": 23590 }, { "epoch": 3.0669265756985054, "grad_norm": 0.47201159596443176, "learning_rate": 1.9330734243014946e-05, "loss": 0.1453, "step": 23600 }, { "epoch": 3.0682261208577, "grad_norm": 0.5392578840255737, "learning_rate": 1.9317738791423005e-05, "loss": 0.0993, "step": 23610 }, { "epoch": 3.069525666016894, "grad_norm": 1.2113598585128784, "learning_rate": 1.930474333983106e-05, "loss": 0.1242, "step": 23620 }, { "epoch": 3.0708252111760883, "grad_norm": 0.20692571997642517, "learning_rate": 1.9291747888239117e-05, "loss": 0.0994, "step": 23630 }, { "epoch": 3.0721247563352825, "grad_norm": 0.46313026547431946, "learning_rate": 1.9278752436647173e-05, "loss": 0.0773, "step": 23640 }, { "epoch": 3.073424301494477, "grad_norm": 0.1307726800441742, "learning_rate": 1.9265756985055232e-05, "loss": 0.1556, "step": 23650 }, { "epoch": 3.0747238466536713, "grad_norm": 0.29995203018188477, "learning_rate": 1.925276153346329e-05, "loss": 0.1162, "step": 23660 }, { "epoch": 3.0760233918128654, "grad_norm": 0.5025638341903687, "learning_rate": 1.9239766081871347e-05, "loss": 0.0868, "step": 23670 }, { "epoch": 3.0773229369720596, "grad_norm": 1.4581894874572754, "learning_rate": 1.9226770630279403e-05, "loss": 0.14, "step": 23680 }, { "epoch": 3.0786224821312542, "grad_norm": 0.736814558506012, "learning_rate": 1.921377517868746e-05, "loss": 0.1262, "step": 23690 }, { "epoch": 3.0799220272904484, "grad_norm": 0.3992776572704315, "learning_rate": 1.9200779727095518e-05, "loss": 0.0914, "step": 23700 }, { "epoch": 3.0812215724496426, "grad_norm": 0.3053443431854248, "learning_rate": 1.9187784275503577e-05, "loss": 0.115, "step": 23710 }, { "epoch": 3.0825211176088367, "grad_norm": 0.22565311193466187, "learning_rate": 1.9174788823911633e-05, "loss": 0.1396, "step": 23720 }, { "epoch": 3.0838206627680314, "grad_norm": 0.5885831117630005, "learning_rate": 1.916179337231969e-05, "loss": 0.1233, "step": 23730 }, { "epoch": 3.0851202079272255, "grad_norm": 0.37310144305229187, "learning_rate": 1.9148797920727745e-05, "loss": 0.0988, "step": 23740 }, { "epoch": 3.0864197530864197, "grad_norm": 0.40382930636405945, "learning_rate": 1.91358024691358e-05, "loss": 0.1515, "step": 23750 }, { "epoch": 3.087719298245614, "grad_norm": 0.5291951298713684, "learning_rate": 1.9122807017543863e-05, "loss": 0.1262, "step": 23760 }, { "epoch": 3.0890188434048085, "grad_norm": 0.11046644300222397, "learning_rate": 1.910981156595192e-05, "loss": 0.1504, "step": 23770 }, { "epoch": 3.0903183885640026, "grad_norm": 0.86225825548172, "learning_rate": 1.9096816114359975e-05, "loss": 0.1314, "step": 23780 }, { "epoch": 3.091617933723197, "grad_norm": 0.31821930408477783, "learning_rate": 1.908382066276803e-05, "loss": 0.1746, "step": 23790 }, { "epoch": 3.092917478882391, "grad_norm": 0.9199597239494324, "learning_rate": 1.9070825211176087e-05, "loss": 0.1146, "step": 23800 }, { "epoch": 3.0942170240415856, "grad_norm": 0.9043316841125488, "learning_rate": 1.9057829759584146e-05, "loss": 0.1348, "step": 23810 }, { "epoch": 3.0955165692007798, "grad_norm": 0.5627151727676392, "learning_rate": 1.9044834307992205e-05, "loss": 0.1068, "step": 23820 }, { "epoch": 3.096816114359974, "grad_norm": 1.1145998239517212, "learning_rate": 1.903183885640026e-05, "loss": 0.1312, "step": 23830 }, { "epoch": 3.098115659519168, "grad_norm": 0.7664873003959656, "learning_rate": 1.9018843404808317e-05, "loss": 0.0945, "step": 23840 }, { "epoch": 3.0994152046783627, "grad_norm": 0.4605230391025543, "learning_rate": 1.9005847953216373e-05, "loss": 0.1444, "step": 23850 }, { "epoch": 3.100714749837557, "grad_norm": 0.22441445291042328, "learning_rate": 1.8992852501624432e-05, "loss": 0.1398, "step": 23860 }, { "epoch": 3.102014294996751, "grad_norm": 0.6064275503158569, "learning_rate": 1.897985705003249e-05, "loss": 0.1978, "step": 23870 }, { "epoch": 3.1033138401559452, "grad_norm": 0.6020766496658325, "learning_rate": 1.8966861598440547e-05, "loss": 0.3237, "step": 23880 }, { "epoch": 3.10461338531514, "grad_norm": 0.15798528492450714, "learning_rate": 1.8953866146848603e-05, "loss": 0.1157, "step": 23890 }, { "epoch": 3.105912930474334, "grad_norm": 0.34893351793289185, "learning_rate": 1.894087069525666e-05, "loss": 0.1001, "step": 23900 }, { "epoch": 3.107212475633528, "grad_norm": 1.6883172988891602, "learning_rate": 1.8927875243664718e-05, "loss": 0.1548, "step": 23910 }, { "epoch": 3.1085120207927224, "grad_norm": 0.5937908291816711, "learning_rate": 1.8914879792072777e-05, "loss": 0.1166, "step": 23920 }, { "epoch": 3.109811565951917, "grad_norm": 0.2412593513727188, "learning_rate": 1.8901884340480833e-05, "loss": 0.122, "step": 23930 }, { "epoch": 3.111111111111111, "grad_norm": 1.1435216665267944, "learning_rate": 1.888888888888889e-05, "loss": 0.1644, "step": 23940 }, { "epoch": 3.1124106562703053, "grad_norm": 0.5297419428825378, "learning_rate": 1.8875893437296945e-05, "loss": 0.0905, "step": 23950 }, { "epoch": 3.1137102014294995, "grad_norm": 0.33782124519348145, "learning_rate": 1.8862897985705004e-05, "loss": 0.0978, "step": 23960 }, { "epoch": 3.115009746588694, "grad_norm": 0.16917681694030762, "learning_rate": 1.8849902534113063e-05, "loss": 0.1381, "step": 23970 }, { "epoch": 3.1163092917478883, "grad_norm": 0.7122665643692017, "learning_rate": 1.883690708252112e-05, "loss": 0.1057, "step": 23980 }, { "epoch": 3.1176088369070825, "grad_norm": 0.7536989450454712, "learning_rate": 1.8823911630929175e-05, "loss": 0.1368, "step": 23990 }, { "epoch": 3.1189083820662766, "grad_norm": 0.4185729920864105, "learning_rate": 1.881091617933723e-05, "loss": 0.1212, "step": 24000 }, { "epoch": 3.1202079272254712, "grad_norm": 0.17294780910015106, "learning_rate": 1.879792072774529e-05, "loss": 0.122, "step": 24010 }, { "epoch": 3.1215074723846654, "grad_norm": 0.6365562081336975, "learning_rate": 1.878492527615335e-05, "loss": 0.1082, "step": 24020 }, { "epoch": 3.1228070175438596, "grad_norm": 0.48872557282447815, "learning_rate": 1.8771929824561405e-05, "loss": 0.137, "step": 24030 }, { "epoch": 3.1241065627030538, "grad_norm": 0.668732225894928, "learning_rate": 1.875893437296946e-05, "loss": 0.084, "step": 24040 }, { "epoch": 3.1254061078622484, "grad_norm": 0.743445634841919, "learning_rate": 1.8745938921377517e-05, "loss": 0.1188, "step": 24050 }, { "epoch": 3.1267056530214425, "grad_norm": 0.45885807275772095, "learning_rate": 1.8732943469785576e-05, "loss": 0.1294, "step": 24060 }, { "epoch": 3.1280051981806367, "grad_norm": 0.580765962600708, "learning_rate": 1.8719948018193632e-05, "loss": 0.0945, "step": 24070 }, { "epoch": 3.129304743339831, "grad_norm": 0.6282307505607605, "learning_rate": 1.870695256660169e-05, "loss": 0.1444, "step": 24080 }, { "epoch": 3.1306042884990255, "grad_norm": 0.26650041341781616, "learning_rate": 1.8693957115009747e-05, "loss": 0.1149, "step": 24090 }, { "epoch": 3.1319038336582197, "grad_norm": 0.4342084527015686, "learning_rate": 1.8680961663417803e-05, "loss": 0.1225, "step": 24100 }, { "epoch": 3.133203378817414, "grad_norm": 0.3555283546447754, "learning_rate": 1.8667966211825862e-05, "loss": 0.1047, "step": 24110 }, { "epoch": 3.134502923976608, "grad_norm": 0.9767182469367981, "learning_rate": 1.8654970760233918e-05, "loss": 0.1043, "step": 24120 }, { "epoch": 3.1358024691358026, "grad_norm": 0.6637300848960876, "learning_rate": 1.8641975308641977e-05, "loss": 0.103, "step": 24130 }, { "epoch": 3.137102014294997, "grad_norm": 1.1984525918960571, "learning_rate": 1.8628979857050033e-05, "loss": 0.1851, "step": 24140 }, { "epoch": 3.138401559454191, "grad_norm": 1.1785038709640503, "learning_rate": 1.861598440545809e-05, "loss": 0.1583, "step": 24150 }, { "epoch": 3.139701104613385, "grad_norm": 0.4537869989871979, "learning_rate": 1.860298895386615e-05, "loss": 0.1518, "step": 24160 }, { "epoch": 3.1410006497725798, "grad_norm": 0.30634060502052307, "learning_rate": 1.8589993502274204e-05, "loss": 0.1001, "step": 24170 }, { "epoch": 3.142300194931774, "grad_norm": 0.5786457061767578, "learning_rate": 1.8576998050682264e-05, "loss": 0.0815, "step": 24180 }, { "epoch": 3.143599740090968, "grad_norm": 0.8024723529815674, "learning_rate": 1.856400259909032e-05, "loss": 0.1389, "step": 24190 }, { "epoch": 3.1448992852501623, "grad_norm": 0.4762561619281769, "learning_rate": 1.8551007147498375e-05, "loss": 0.1098, "step": 24200 }, { "epoch": 3.146198830409357, "grad_norm": 0.43103721737861633, "learning_rate": 1.8538011695906434e-05, "loss": 0.1428, "step": 24210 }, { "epoch": 3.147498375568551, "grad_norm": 0.38501062989234924, "learning_rate": 1.852501624431449e-05, "loss": 0.1611, "step": 24220 }, { "epoch": 3.1487979207277452, "grad_norm": 0.5097856521606445, "learning_rate": 1.851202079272255e-05, "loss": 0.0776, "step": 24230 }, { "epoch": 3.1500974658869394, "grad_norm": 0.6945882439613342, "learning_rate": 1.8499025341130605e-05, "loss": 0.0878, "step": 24240 }, { "epoch": 3.151397011046134, "grad_norm": 0.4615586996078491, "learning_rate": 1.848602988953866e-05, "loss": 0.1902, "step": 24250 }, { "epoch": 3.152696556205328, "grad_norm": 0.8051995635032654, "learning_rate": 1.847303443794672e-05, "loss": 0.1058, "step": 24260 }, { "epoch": 3.1539961013645224, "grad_norm": 0.6224893927574158, "learning_rate": 1.8460038986354776e-05, "loss": 0.2591, "step": 24270 }, { "epoch": 3.1552956465237165, "grad_norm": 0.27153903245925903, "learning_rate": 1.8447043534762836e-05, "loss": 0.1821, "step": 24280 }, { "epoch": 3.156595191682911, "grad_norm": 0.26252830028533936, "learning_rate": 1.843404808317089e-05, "loss": 0.1301, "step": 24290 }, { "epoch": 3.1578947368421053, "grad_norm": 0.5724856853485107, "learning_rate": 1.8421052631578947e-05, "loss": 0.1207, "step": 24300 }, { "epoch": 3.1591942820012995, "grad_norm": 0.3952215611934662, "learning_rate": 1.8408057179987007e-05, "loss": 0.1385, "step": 24310 }, { "epoch": 3.1604938271604937, "grad_norm": 0.6879607439041138, "learning_rate": 1.8395061728395062e-05, "loss": 0.1279, "step": 24320 }, { "epoch": 3.1617933723196883, "grad_norm": 0.5807758569717407, "learning_rate": 1.838206627680312e-05, "loss": 0.0906, "step": 24330 }, { "epoch": 3.1630929174788824, "grad_norm": 0.6032560467720032, "learning_rate": 1.8369070825211178e-05, "loss": 0.0974, "step": 24340 }, { "epoch": 3.1643924626380766, "grad_norm": 1.0369348526000977, "learning_rate": 1.8356075373619233e-05, "loss": 0.1094, "step": 24350 }, { "epoch": 3.165692007797271, "grad_norm": 1.0153614282608032, "learning_rate": 1.8343079922027293e-05, "loss": 0.1794, "step": 24360 }, { "epoch": 3.1669915529564654, "grad_norm": 0.5165131688117981, "learning_rate": 1.833008447043535e-05, "loss": 0.1169, "step": 24370 }, { "epoch": 3.1682910981156596, "grad_norm": 0.2246917337179184, "learning_rate": 1.8317089018843404e-05, "loss": 0.1056, "step": 24380 }, { "epoch": 3.1695906432748537, "grad_norm": 0.49698784947395325, "learning_rate": 1.8304093567251464e-05, "loss": 0.1041, "step": 24390 }, { "epoch": 3.170890188434048, "grad_norm": 0.4028409421443939, "learning_rate": 1.829109811565952e-05, "loss": 0.2034, "step": 24400 }, { "epoch": 3.1721897335932425, "grad_norm": 0.14522278308868408, "learning_rate": 1.827810266406758e-05, "loss": 0.1381, "step": 24410 }, { "epoch": 3.1734892787524367, "grad_norm": 0.28312450647354126, "learning_rate": 1.8265107212475635e-05, "loss": 0.1175, "step": 24420 }, { "epoch": 3.174788823911631, "grad_norm": 0.6201525330543518, "learning_rate": 1.825211176088369e-05, "loss": 0.1495, "step": 24430 }, { "epoch": 3.176088369070825, "grad_norm": 0.18772590160369873, "learning_rate": 1.823911630929175e-05, "loss": 0.1161, "step": 24440 }, { "epoch": 3.1773879142300196, "grad_norm": 0.9260114431381226, "learning_rate": 1.8226120857699806e-05, "loss": 0.1465, "step": 24450 }, { "epoch": 3.178687459389214, "grad_norm": 0.5036524534225464, "learning_rate": 1.8213125406107865e-05, "loss": 0.1253, "step": 24460 }, { "epoch": 3.179987004548408, "grad_norm": 0.11712438613176346, "learning_rate": 1.820012995451592e-05, "loss": 0.131, "step": 24470 }, { "epoch": 3.181286549707602, "grad_norm": 0.1250913292169571, "learning_rate": 1.8187134502923976e-05, "loss": 0.1018, "step": 24480 }, { "epoch": 3.1825860948667968, "grad_norm": 0.41125330328941345, "learning_rate": 1.8174139051332036e-05, "loss": 0.0872, "step": 24490 }, { "epoch": 3.183885640025991, "grad_norm": 0.5190581679344177, "learning_rate": 1.816114359974009e-05, "loss": 0.1465, "step": 24500 }, { "epoch": 3.185185185185185, "grad_norm": 0.43935924768447876, "learning_rate": 1.814814814814815e-05, "loss": 0.1104, "step": 24510 }, { "epoch": 3.1864847303443793, "grad_norm": 0.42558181285858154, "learning_rate": 1.8135152696556207e-05, "loss": 0.0976, "step": 24520 }, { "epoch": 3.187784275503574, "grad_norm": 0.8748669028282166, "learning_rate": 1.8122157244964263e-05, "loss": 0.1287, "step": 24530 }, { "epoch": 3.189083820662768, "grad_norm": 0.7081230282783508, "learning_rate": 1.8109161793372322e-05, "loss": 0.1126, "step": 24540 }, { "epoch": 3.1903833658219622, "grad_norm": 0.3370561897754669, "learning_rate": 1.8096166341780378e-05, "loss": 0.1622, "step": 24550 }, { "epoch": 3.1916829109811564, "grad_norm": 0.722572386264801, "learning_rate": 1.8083170890188437e-05, "loss": 0.1379, "step": 24560 }, { "epoch": 3.192982456140351, "grad_norm": 0.3738165497779846, "learning_rate": 1.8070175438596493e-05, "loss": 0.0882, "step": 24570 }, { "epoch": 3.194282001299545, "grad_norm": 1.077443242073059, "learning_rate": 1.805717998700455e-05, "loss": 0.2056, "step": 24580 }, { "epoch": 3.1955815464587394, "grad_norm": 0.481641560792923, "learning_rate": 1.8044184535412604e-05, "loss": 0.1284, "step": 24590 }, { "epoch": 3.1968810916179335, "grad_norm": 0.8076510429382324, "learning_rate": 1.8031189083820664e-05, "loss": 0.1879, "step": 24600 }, { "epoch": 3.198180636777128, "grad_norm": 0.6977196335792542, "learning_rate": 1.8018193632228723e-05, "loss": 0.1519, "step": 24610 }, { "epoch": 3.1994801819363223, "grad_norm": 0.8845590353012085, "learning_rate": 1.800519818063678e-05, "loss": 0.1866, "step": 24620 }, { "epoch": 3.2007797270955165, "grad_norm": 0.8850059509277344, "learning_rate": 1.7992202729044835e-05, "loss": 0.1067, "step": 24630 }, { "epoch": 3.2020792722547107, "grad_norm": 1.181011438369751, "learning_rate": 1.797920727745289e-05, "loss": 0.1305, "step": 24640 }, { "epoch": 3.2033788174139053, "grad_norm": 0.9063879251480103, "learning_rate": 1.796621182586095e-05, "loss": 0.1167, "step": 24650 }, { "epoch": 3.2046783625730995, "grad_norm": 0.22431302070617676, "learning_rate": 1.795321637426901e-05, "loss": 0.1112, "step": 24660 }, { "epoch": 3.2059779077322936, "grad_norm": 0.12858910858631134, "learning_rate": 1.7940220922677065e-05, "loss": 0.0709, "step": 24670 }, { "epoch": 3.207277452891488, "grad_norm": 0.3249017596244812, "learning_rate": 1.792722547108512e-05, "loss": 0.1051, "step": 24680 }, { "epoch": 3.2085769980506824, "grad_norm": 0.6528042554855347, "learning_rate": 1.7914230019493177e-05, "loss": 0.1232, "step": 24690 }, { "epoch": 3.2098765432098766, "grad_norm": 0.2795039117336273, "learning_rate": 1.7901234567901236e-05, "loss": 0.1328, "step": 24700 }, { "epoch": 3.2111760883690708, "grad_norm": 0.9122927188873291, "learning_rate": 1.7888239116309295e-05, "loss": 0.1292, "step": 24710 }, { "epoch": 3.212475633528265, "grad_norm": 0.2685626447200775, "learning_rate": 1.787524366471735e-05, "loss": 0.0974, "step": 24720 }, { "epoch": 3.2137751786874595, "grad_norm": 0.5914322733879089, "learning_rate": 1.7862248213125407e-05, "loss": 0.132, "step": 24730 }, { "epoch": 3.2150747238466537, "grad_norm": 0.6574431657791138, "learning_rate": 1.7849252761533463e-05, "loss": 0.1159, "step": 24740 }, { "epoch": 3.216374269005848, "grad_norm": 0.5285223126411438, "learning_rate": 1.7836257309941522e-05, "loss": 0.1667, "step": 24750 }, { "epoch": 3.217673814165042, "grad_norm": 1.7608710527420044, "learning_rate": 1.782326185834958e-05, "loss": 0.1978, "step": 24760 }, { "epoch": 3.2189733593242367, "grad_norm": 0.4087387025356293, "learning_rate": 1.7810266406757637e-05, "loss": 0.1411, "step": 24770 }, { "epoch": 3.220272904483431, "grad_norm": 0.3451167941093445, "learning_rate": 1.7797270955165693e-05, "loss": 0.1582, "step": 24780 }, { "epoch": 3.221572449642625, "grad_norm": 0.3548254668712616, "learning_rate": 1.778427550357375e-05, "loss": 0.1156, "step": 24790 }, { "epoch": 3.222871994801819, "grad_norm": 0.6829783320426941, "learning_rate": 1.7771280051981808e-05, "loss": 0.1125, "step": 24800 }, { "epoch": 3.224171539961014, "grad_norm": 0.2977861166000366, "learning_rate": 1.7758284600389867e-05, "loss": 0.1106, "step": 24810 }, { "epoch": 3.225471085120208, "grad_norm": 0.15655678510665894, "learning_rate": 1.7745289148797923e-05, "loss": 0.1254, "step": 24820 }, { "epoch": 3.226770630279402, "grad_norm": 0.36795657873153687, "learning_rate": 1.773229369720598e-05, "loss": 0.1028, "step": 24830 }, { "epoch": 3.2280701754385963, "grad_norm": 0.8019411563873291, "learning_rate": 1.7719298245614035e-05, "loss": 0.1562, "step": 24840 }, { "epoch": 3.229369720597791, "grad_norm": 0.4231128692626953, "learning_rate": 1.770630279402209e-05, "loss": 0.116, "step": 24850 }, { "epoch": 3.230669265756985, "grad_norm": 0.4318840801715851, "learning_rate": 1.7693307342430153e-05, "loss": 0.11, "step": 24860 }, { "epoch": 3.2319688109161793, "grad_norm": 0.2638729512691498, "learning_rate": 1.768031189083821e-05, "loss": 0.1042, "step": 24870 }, { "epoch": 3.2332683560753734, "grad_norm": 0.17908115684986115, "learning_rate": 1.7667316439246265e-05, "loss": 0.0879, "step": 24880 }, { "epoch": 3.234567901234568, "grad_norm": 0.7806366682052612, "learning_rate": 1.765432098765432e-05, "loss": 0.1061, "step": 24890 }, { "epoch": 3.2358674463937622, "grad_norm": 0.731069803237915, "learning_rate": 1.7641325536062377e-05, "loss": 0.1029, "step": 24900 }, { "epoch": 3.2371669915529564, "grad_norm": 0.3574985861778259, "learning_rate": 1.7628330084470436e-05, "loss": 0.0999, "step": 24910 }, { "epoch": 3.2384665367121506, "grad_norm": 0.5247701406478882, "learning_rate": 1.7615334632878495e-05, "loss": 0.106, "step": 24920 }, { "epoch": 3.239766081871345, "grad_norm": 0.2273469865322113, "learning_rate": 1.760233918128655e-05, "loss": 0.1543, "step": 24930 }, { "epoch": 3.2410656270305394, "grad_norm": 0.31109604239463806, "learning_rate": 1.7589343729694607e-05, "loss": 0.1019, "step": 24940 }, { "epoch": 3.2423651721897335, "grad_norm": 0.41189706325531006, "learning_rate": 1.7576348278102663e-05, "loss": 0.075, "step": 24950 }, { "epoch": 3.2436647173489277, "grad_norm": 0.9300059676170349, "learning_rate": 1.7563352826510722e-05, "loss": 0.1673, "step": 24960 }, { "epoch": 3.2449642625081223, "grad_norm": 0.22386249899864197, "learning_rate": 1.755035737491878e-05, "loss": 0.1172, "step": 24970 }, { "epoch": 3.2462638076673165, "grad_norm": 0.3889801502227783, "learning_rate": 1.7537361923326837e-05, "loss": 0.0985, "step": 24980 }, { "epoch": 3.2475633528265107, "grad_norm": 0.9983554482460022, "learning_rate": 1.7524366471734893e-05, "loss": 0.1218, "step": 24990 }, { "epoch": 3.248862897985705, "grad_norm": 0.7092593908309937, "learning_rate": 1.751137102014295e-05, "loss": 0.133, "step": 25000 }, { "epoch": 3.2501624431448994, "grad_norm": 0.4601169228553772, "learning_rate": 1.7498375568551008e-05, "loss": 0.0807, "step": 25010 }, { "epoch": 3.2514619883040936, "grad_norm": 0.6518809199333191, "learning_rate": 1.7485380116959067e-05, "loss": 0.1285, "step": 25020 }, { "epoch": 3.252761533463288, "grad_norm": 0.6817179322242737, "learning_rate": 1.7472384665367123e-05, "loss": 0.1454, "step": 25030 }, { "epoch": 3.254061078622482, "grad_norm": 0.27686607837677, "learning_rate": 1.745938921377518e-05, "loss": 0.1044, "step": 25040 }, { "epoch": 3.2553606237816766, "grad_norm": 0.4894848167896271, "learning_rate": 1.7446393762183235e-05, "loss": 0.1418, "step": 25050 }, { "epoch": 3.2566601689408707, "grad_norm": 0.35414624214172363, "learning_rate": 1.7433398310591294e-05, "loss": 0.1908, "step": 25060 }, { "epoch": 3.257959714100065, "grad_norm": 0.2177383154630661, "learning_rate": 1.7420402858999353e-05, "loss": 0.0925, "step": 25070 }, { "epoch": 3.259259259259259, "grad_norm": 0.6438485980033875, "learning_rate": 1.740740740740741e-05, "loss": 0.1469, "step": 25080 }, { "epoch": 3.2605588044184537, "grad_norm": 0.4870336353778839, "learning_rate": 1.7394411955815465e-05, "loss": 0.1024, "step": 25090 }, { "epoch": 3.261858349577648, "grad_norm": 0.6195259094238281, "learning_rate": 1.738141650422352e-05, "loss": 0.1624, "step": 25100 }, { "epoch": 3.263157894736842, "grad_norm": 0.4394708573818207, "learning_rate": 1.736842105263158e-05, "loss": 0.1134, "step": 25110 }, { "epoch": 3.264457439896036, "grad_norm": 0.5055453777313232, "learning_rate": 1.735542560103964e-05, "loss": 0.1425, "step": 25120 }, { "epoch": 3.265756985055231, "grad_norm": 0.920200526714325, "learning_rate": 1.7342430149447695e-05, "loss": 0.1766, "step": 25130 }, { "epoch": 3.267056530214425, "grad_norm": 0.6686515212059021, "learning_rate": 1.732943469785575e-05, "loss": 0.1223, "step": 25140 }, { "epoch": 3.268356075373619, "grad_norm": 0.44684237241744995, "learning_rate": 1.7316439246263807e-05, "loss": 0.1359, "step": 25150 }, { "epoch": 3.2696556205328133, "grad_norm": 0.39347711205482483, "learning_rate": 1.7303443794671866e-05, "loss": 0.1408, "step": 25160 }, { "epoch": 3.270955165692008, "grad_norm": 1.230934977531433, "learning_rate": 1.7290448343079922e-05, "loss": 0.1469, "step": 25170 }, { "epoch": 3.272254710851202, "grad_norm": 0.5124199390411377, "learning_rate": 1.727745289148798e-05, "loss": 0.0843, "step": 25180 }, { "epoch": 3.2735542560103963, "grad_norm": 0.32244187593460083, "learning_rate": 1.7264457439896037e-05, "loss": 0.1134, "step": 25190 }, { "epoch": 3.2748538011695905, "grad_norm": 0.36243340373039246, "learning_rate": 1.7251461988304093e-05, "loss": 0.0996, "step": 25200 }, { "epoch": 3.276153346328785, "grad_norm": 0.5879245400428772, "learning_rate": 1.7238466536712152e-05, "loss": 0.1381, "step": 25210 }, { "epoch": 3.2774528914879792, "grad_norm": 0.36844706535339355, "learning_rate": 1.7225471085120208e-05, "loss": 0.1202, "step": 25220 }, { "epoch": 3.2787524366471734, "grad_norm": 0.500126838684082, "learning_rate": 1.7212475633528267e-05, "loss": 0.1435, "step": 25230 }, { "epoch": 3.2800519818063676, "grad_norm": 0.6457122564315796, "learning_rate": 1.7199480181936323e-05, "loss": 0.0997, "step": 25240 }, { "epoch": 3.281351526965562, "grad_norm": 0.5197457671165466, "learning_rate": 1.718648473034438e-05, "loss": 0.0936, "step": 25250 }, { "epoch": 3.2826510721247564, "grad_norm": 0.7146595120429993, "learning_rate": 1.7173489278752438e-05, "loss": 0.1733, "step": 25260 }, { "epoch": 3.2839506172839505, "grad_norm": 0.7328951954841614, "learning_rate": 1.7160493827160494e-05, "loss": 0.1591, "step": 25270 }, { "epoch": 3.2852501624431447, "grad_norm": 0.9857842326164246, "learning_rate": 1.7147498375568553e-05, "loss": 0.1611, "step": 25280 }, { "epoch": 3.2865497076023393, "grad_norm": 0.430507630109787, "learning_rate": 1.713450292397661e-05, "loss": 0.1019, "step": 25290 }, { "epoch": 3.2878492527615335, "grad_norm": 0.48221907019615173, "learning_rate": 1.7121507472384665e-05, "loss": 0.1786, "step": 25300 }, { "epoch": 3.2891487979207277, "grad_norm": 0.23685118556022644, "learning_rate": 1.7108512020792724e-05, "loss": 0.1294, "step": 25310 }, { "epoch": 3.290448343079922, "grad_norm": 0.21667973697185516, "learning_rate": 1.709551656920078e-05, "loss": 0.1396, "step": 25320 }, { "epoch": 3.2917478882391165, "grad_norm": 0.33160626888275146, "learning_rate": 1.708252111760884e-05, "loss": 0.135, "step": 25330 }, { "epoch": 3.2930474333983106, "grad_norm": 0.3070947825908661, "learning_rate": 1.7069525666016895e-05, "loss": 0.0876, "step": 25340 }, { "epoch": 3.294346978557505, "grad_norm": 0.32584071159362793, "learning_rate": 1.705653021442495e-05, "loss": 0.1348, "step": 25350 }, { "epoch": 3.295646523716699, "grad_norm": 0.31310054659843445, "learning_rate": 1.704353476283301e-05, "loss": 0.1092, "step": 25360 }, { "epoch": 3.2969460688758936, "grad_norm": 0.2480722814798355, "learning_rate": 1.7030539311241066e-05, "loss": 0.1357, "step": 25370 }, { "epoch": 3.2982456140350878, "grad_norm": 0.32178372144699097, "learning_rate": 1.7017543859649125e-05, "loss": 0.1574, "step": 25380 }, { "epoch": 3.299545159194282, "grad_norm": 0.7686633467674255, "learning_rate": 1.700454840805718e-05, "loss": 0.1205, "step": 25390 }, { "epoch": 3.300844704353476, "grad_norm": 0.7287821173667908, "learning_rate": 1.6991552956465237e-05, "loss": 0.1381, "step": 25400 }, { "epoch": 3.3021442495126707, "grad_norm": 0.1880289763212204, "learning_rate": 1.6978557504873296e-05, "loss": 0.109, "step": 25410 }, { "epoch": 3.303443794671865, "grad_norm": 0.7714895606040955, "learning_rate": 1.6965562053281352e-05, "loss": 0.1415, "step": 25420 }, { "epoch": 3.304743339831059, "grad_norm": 0.40846240520477295, "learning_rate": 1.6952566601689408e-05, "loss": 0.0876, "step": 25430 }, { "epoch": 3.3060428849902532, "grad_norm": 0.4281584620475769, "learning_rate": 1.6939571150097467e-05, "loss": 0.1203, "step": 25440 }, { "epoch": 3.307342430149448, "grad_norm": 0.3917231857776642, "learning_rate": 1.6926575698505523e-05, "loss": 0.0875, "step": 25450 }, { "epoch": 3.308641975308642, "grad_norm": 0.7948154807090759, "learning_rate": 1.6913580246913582e-05, "loss": 0.1531, "step": 25460 }, { "epoch": 3.309941520467836, "grad_norm": 0.5684565305709839, "learning_rate": 1.690058479532164e-05, "loss": 0.1165, "step": 25470 }, { "epoch": 3.3112410656270304, "grad_norm": 0.6903478503227234, "learning_rate": 1.6887589343729694e-05, "loss": 0.1561, "step": 25480 }, { "epoch": 3.312540610786225, "grad_norm": 0.577892005443573, "learning_rate": 1.6874593892137753e-05, "loss": 0.1157, "step": 25490 }, { "epoch": 3.313840155945419, "grad_norm": 0.3905918598175049, "learning_rate": 1.686159844054581e-05, "loss": 0.1503, "step": 25500 }, { "epoch": 3.3151397011046133, "grad_norm": 0.2669404149055481, "learning_rate": 1.684860298895387e-05, "loss": 0.0967, "step": 25510 }, { "epoch": 3.3164392462638075, "grad_norm": 1.1652873754501343, "learning_rate": 1.6835607537361924e-05, "loss": 0.1456, "step": 25520 }, { "epoch": 3.317738791423002, "grad_norm": 0.18323050439357758, "learning_rate": 1.682261208576998e-05, "loss": 0.1248, "step": 25530 }, { "epoch": 3.3190383365821963, "grad_norm": 0.3327161967754364, "learning_rate": 1.680961663417804e-05, "loss": 0.1119, "step": 25540 }, { "epoch": 3.3203378817413904, "grad_norm": 0.6245760917663574, "learning_rate": 1.6796621182586095e-05, "loss": 0.1315, "step": 25550 }, { "epoch": 3.3216374269005846, "grad_norm": 0.9771631360054016, "learning_rate": 1.6783625730994155e-05, "loss": 0.1169, "step": 25560 }, { "epoch": 3.3229369720597792, "grad_norm": 0.19558075070381165, "learning_rate": 1.677063027940221e-05, "loss": 0.1092, "step": 25570 }, { "epoch": 3.3242365172189734, "grad_norm": 0.12322435528039932, "learning_rate": 1.6757634827810266e-05, "loss": 0.1222, "step": 25580 }, { "epoch": 3.3255360623781676, "grad_norm": 0.9688445925712585, "learning_rate": 1.6744639376218326e-05, "loss": 0.1717, "step": 25590 }, { "epoch": 3.3268356075373617, "grad_norm": 0.5606496930122375, "learning_rate": 1.673164392462638e-05, "loss": 0.1722, "step": 25600 }, { "epoch": 3.3281351526965564, "grad_norm": 0.5375979542732239, "learning_rate": 1.671864847303444e-05, "loss": 0.1168, "step": 25610 }, { "epoch": 3.3294346978557505, "grad_norm": 0.36600223183631897, "learning_rate": 1.6705653021442497e-05, "loss": 0.0956, "step": 25620 }, { "epoch": 3.3307342430149447, "grad_norm": 0.39133715629577637, "learning_rate": 1.6692657569850552e-05, "loss": 0.153, "step": 25630 }, { "epoch": 3.332033788174139, "grad_norm": 1.0858362913131714, "learning_rate": 1.6679662118258608e-05, "loss": 0.1411, "step": 25640 }, { "epoch": 3.3333333333333335, "grad_norm": 0.6908590793609619, "learning_rate": 1.6666666666666667e-05, "loss": 0.1264, "step": 25650 }, { "epoch": 3.3346328784925277, "grad_norm": 0.5211133360862732, "learning_rate": 1.6653671215074727e-05, "loss": 0.1571, "step": 25660 }, { "epoch": 3.335932423651722, "grad_norm": 0.6561243534088135, "learning_rate": 1.6640675763482783e-05, "loss": 0.1042, "step": 25670 }, { "epoch": 3.337231968810916, "grad_norm": 0.24133160710334778, "learning_rate": 1.662768031189084e-05, "loss": 0.0942, "step": 25680 }, { "epoch": 3.3385315139701106, "grad_norm": 0.44651123881340027, "learning_rate": 1.6614684860298894e-05, "loss": 0.1424, "step": 25690 }, { "epoch": 3.339831059129305, "grad_norm": 0.5692631602287292, "learning_rate": 1.6601689408706954e-05, "loss": 0.1086, "step": 25700 }, { "epoch": 3.341130604288499, "grad_norm": 0.8550496697425842, "learning_rate": 1.6588693957115013e-05, "loss": 0.1874, "step": 25710 }, { "epoch": 3.342430149447693, "grad_norm": 0.3080807030200958, "learning_rate": 1.657569850552307e-05, "loss": 0.1095, "step": 25720 }, { "epoch": 3.3437296946068877, "grad_norm": 1.5193169116973877, "learning_rate": 1.6562703053931124e-05, "loss": 0.2238, "step": 25730 }, { "epoch": 3.345029239766082, "grad_norm": 1.2858705520629883, "learning_rate": 1.654970760233918e-05, "loss": 0.1163, "step": 25740 }, { "epoch": 3.346328784925276, "grad_norm": 0.5886190533638, "learning_rate": 1.653671215074724e-05, "loss": 0.0919, "step": 25750 }, { "epoch": 3.3476283300844702, "grad_norm": 0.3259219825267792, "learning_rate": 1.65237166991553e-05, "loss": 0.166, "step": 25760 }, { "epoch": 3.348927875243665, "grad_norm": 0.4113786518573761, "learning_rate": 1.6510721247563355e-05, "loss": 0.0905, "step": 25770 }, { "epoch": 3.350227420402859, "grad_norm": 0.696774959564209, "learning_rate": 1.649772579597141e-05, "loss": 0.1602, "step": 25780 }, { "epoch": 3.351526965562053, "grad_norm": 0.28489983081817627, "learning_rate": 1.6484730344379466e-05, "loss": 0.1781, "step": 25790 }, { "epoch": 3.3528265107212474, "grad_norm": 0.27670353651046753, "learning_rate": 1.6471734892787526e-05, "loss": 0.1546, "step": 25800 }, { "epoch": 3.354126055880442, "grad_norm": 0.19832652807235718, "learning_rate": 1.6458739441195585e-05, "loss": 0.1307, "step": 25810 }, { "epoch": 3.355425601039636, "grad_norm": 0.5467470288276672, "learning_rate": 1.644574398960364e-05, "loss": 0.1063, "step": 25820 }, { "epoch": 3.3567251461988303, "grad_norm": 0.29303476214408875, "learning_rate": 1.6432748538011697e-05, "loss": 0.1288, "step": 25830 }, { "epoch": 3.3580246913580245, "grad_norm": 0.7665645480155945, "learning_rate": 1.6419753086419752e-05, "loss": 0.1529, "step": 25840 }, { "epoch": 3.359324236517219, "grad_norm": 0.20739686489105225, "learning_rate": 1.640675763482781e-05, "loss": 0.1122, "step": 25850 }, { "epoch": 3.3606237816764133, "grad_norm": 0.4693886339664459, "learning_rate": 1.639376218323587e-05, "loss": 0.1119, "step": 25860 }, { "epoch": 3.3619233268356075, "grad_norm": 0.4668714702129364, "learning_rate": 1.6380766731643927e-05, "loss": 0.1072, "step": 25870 }, { "epoch": 3.3632228719948016, "grad_norm": 0.15274550020694733, "learning_rate": 1.6367771280051983e-05, "loss": 0.1211, "step": 25880 }, { "epoch": 3.3645224171539962, "grad_norm": 0.48853522539138794, "learning_rate": 1.635477582846004e-05, "loss": 0.1562, "step": 25890 }, { "epoch": 3.3658219623131904, "grad_norm": 0.2763735055923462, "learning_rate": 1.6341780376868094e-05, "loss": 0.1045, "step": 25900 }, { "epoch": 3.3671215074723846, "grad_norm": 0.7662050127983093, "learning_rate": 1.6328784925276157e-05, "loss": 0.1329, "step": 25910 }, { "epoch": 3.3684210526315788, "grad_norm": 0.23916880786418915, "learning_rate": 1.6315789473684213e-05, "loss": 0.1214, "step": 25920 }, { "epoch": 3.3697205977907734, "grad_norm": 0.7265744209289551, "learning_rate": 1.630279402209227e-05, "loss": 0.087, "step": 25930 }, { "epoch": 3.3710201429499675, "grad_norm": 0.6012350916862488, "learning_rate": 1.6289798570500325e-05, "loss": 0.0923, "step": 25940 }, { "epoch": 3.3723196881091617, "grad_norm": 0.39115840196609497, "learning_rate": 1.627680311890838e-05, "loss": 0.1908, "step": 25950 }, { "epoch": 3.373619233268356, "grad_norm": 0.5151712894439697, "learning_rate": 1.626380766731644e-05, "loss": 0.1094, "step": 25960 }, { "epoch": 3.3749187784275505, "grad_norm": 0.5812552571296692, "learning_rate": 1.62508122157245e-05, "loss": 0.1033, "step": 25970 }, { "epoch": 3.3762183235867447, "grad_norm": 0.13686592876911163, "learning_rate": 1.6237816764132555e-05, "loss": 0.1501, "step": 25980 }, { "epoch": 3.377517868745939, "grad_norm": 0.36698848009109497, "learning_rate": 1.622482131254061e-05, "loss": 0.1589, "step": 25990 }, { "epoch": 3.378817413905133, "grad_norm": 1.321040391921997, "learning_rate": 1.6211825860948666e-05, "loss": 0.1104, "step": 26000 }, { "epoch": 3.3801169590643276, "grad_norm": 0.12876927852630615, "learning_rate": 1.6198830409356726e-05, "loss": 0.161, "step": 26010 }, { "epoch": 3.381416504223522, "grad_norm": 0.9689125418663025, "learning_rate": 1.6185834957764785e-05, "loss": 0.1315, "step": 26020 }, { "epoch": 3.382716049382716, "grad_norm": 0.36027300357818604, "learning_rate": 1.617283950617284e-05, "loss": 0.1722, "step": 26030 }, { "epoch": 3.38401559454191, "grad_norm": 0.3386448323726654, "learning_rate": 1.6159844054580897e-05, "loss": 0.1601, "step": 26040 }, { "epoch": 3.3853151397011048, "grad_norm": 0.38961562514305115, "learning_rate": 1.6146848602988953e-05, "loss": 0.1114, "step": 26050 }, { "epoch": 3.386614684860299, "grad_norm": 0.7539747357368469, "learning_rate": 1.6133853151397012e-05, "loss": 0.0904, "step": 26060 }, { "epoch": 3.387914230019493, "grad_norm": 0.5260695219039917, "learning_rate": 1.612085769980507e-05, "loss": 0.0834, "step": 26070 }, { "epoch": 3.3892137751786873, "grad_norm": 1.0742740631103516, "learning_rate": 1.6107862248213127e-05, "loss": 0.1305, "step": 26080 }, { "epoch": 3.390513320337882, "grad_norm": 0.48439937829971313, "learning_rate": 1.6094866796621183e-05, "loss": 0.0929, "step": 26090 }, { "epoch": 3.391812865497076, "grad_norm": 0.746205747127533, "learning_rate": 1.608187134502924e-05, "loss": 0.1262, "step": 26100 }, { "epoch": 3.3931124106562702, "grad_norm": 0.4529440402984619, "learning_rate": 1.6068875893437298e-05, "loss": 0.1119, "step": 26110 }, { "epoch": 3.3944119558154644, "grad_norm": 0.5629494190216064, "learning_rate": 1.6055880441845357e-05, "loss": 0.146, "step": 26120 }, { "epoch": 3.395711500974659, "grad_norm": 0.8960067629814148, "learning_rate": 1.6042884990253413e-05, "loss": 0.0983, "step": 26130 }, { "epoch": 3.397011046133853, "grad_norm": 0.5869749784469604, "learning_rate": 1.602988953866147e-05, "loss": 0.1059, "step": 26140 }, { "epoch": 3.3983105912930474, "grad_norm": 0.3183312714099884, "learning_rate": 1.6016894087069525e-05, "loss": 0.116, "step": 26150 }, { "epoch": 3.3996101364522415, "grad_norm": 0.34891024231910706, "learning_rate": 1.6003898635477584e-05, "loss": 0.0999, "step": 26160 }, { "epoch": 3.400909681611436, "grad_norm": 0.14670953154563904, "learning_rate": 1.5990903183885643e-05, "loss": 0.109, "step": 26170 }, { "epoch": 3.4022092267706303, "grad_norm": 1.019511103630066, "learning_rate": 1.59779077322937e-05, "loss": 0.1248, "step": 26180 }, { "epoch": 3.4035087719298245, "grad_norm": 1.2760573625564575, "learning_rate": 1.5964912280701755e-05, "loss": 0.134, "step": 26190 }, { "epoch": 3.4048083170890187, "grad_norm": 0.3244759738445282, "learning_rate": 1.595191682910981e-05, "loss": 0.1483, "step": 26200 }, { "epoch": 3.4061078622482133, "grad_norm": 0.46584123373031616, "learning_rate": 1.593892137751787e-05, "loss": 0.1074, "step": 26210 }, { "epoch": 3.4074074074074074, "grad_norm": 0.2507392466068268, "learning_rate": 1.5925925925925926e-05, "loss": 0.1489, "step": 26220 }, { "epoch": 3.4087069525666016, "grad_norm": 0.34980320930480957, "learning_rate": 1.5912930474333985e-05, "loss": 0.098, "step": 26230 }, { "epoch": 3.410006497725796, "grad_norm": 0.41219261288642883, "learning_rate": 1.589993502274204e-05, "loss": 0.1344, "step": 26240 }, { "epoch": 3.4113060428849904, "grad_norm": 1.0026822090148926, "learning_rate": 1.5886939571150097e-05, "loss": 0.1677, "step": 26250 }, { "epoch": 3.4126055880441846, "grad_norm": 0.9816569685935974, "learning_rate": 1.5873944119558156e-05, "loss": 0.174, "step": 26260 }, { "epoch": 3.4139051332033787, "grad_norm": 0.7490084171295166, "learning_rate": 1.5860948667966212e-05, "loss": 0.1784, "step": 26270 }, { "epoch": 3.415204678362573, "grad_norm": 1.8356963396072388, "learning_rate": 1.584795321637427e-05, "loss": 0.1678, "step": 26280 }, { "epoch": 3.4165042235217675, "grad_norm": 0.23773130774497986, "learning_rate": 1.5834957764782327e-05, "loss": 0.1176, "step": 26290 }, { "epoch": 3.4178037686809617, "grad_norm": 0.4884783625602722, "learning_rate": 1.5821962313190383e-05, "loss": 0.1881, "step": 26300 }, { "epoch": 3.419103313840156, "grad_norm": 0.23981575667858124, "learning_rate": 1.5808966861598442e-05, "loss": 0.1746, "step": 26310 }, { "epoch": 3.42040285899935, "grad_norm": 0.1455792337656021, "learning_rate": 1.5795971410006498e-05, "loss": 0.1091, "step": 26320 }, { "epoch": 3.4217024041585447, "grad_norm": 0.9859343767166138, "learning_rate": 1.5782975958414557e-05, "loss": 0.1536, "step": 26330 }, { "epoch": 3.423001949317739, "grad_norm": 0.4218352735042572, "learning_rate": 1.5769980506822613e-05, "loss": 0.0998, "step": 26340 }, { "epoch": 3.424301494476933, "grad_norm": 0.21462267637252808, "learning_rate": 1.575698505523067e-05, "loss": 0.0935, "step": 26350 }, { "epoch": 3.425601039636127, "grad_norm": 0.44205227494239807, "learning_rate": 1.5743989603638728e-05, "loss": 0.1502, "step": 26360 }, { "epoch": 3.426900584795322, "grad_norm": 0.6815353631973267, "learning_rate": 1.5730994152046784e-05, "loss": 0.0948, "step": 26370 }, { "epoch": 3.428200129954516, "grad_norm": 0.6205016374588013, "learning_rate": 1.5717998700454843e-05, "loss": 0.14, "step": 26380 }, { "epoch": 3.42949967511371, "grad_norm": 0.7417633533477783, "learning_rate": 1.57050032488629e-05, "loss": 0.1366, "step": 26390 }, { "epoch": 3.4307992202729043, "grad_norm": 0.8256328701972961, "learning_rate": 1.5692007797270955e-05, "loss": 0.0979, "step": 26400 }, { "epoch": 3.432098765432099, "grad_norm": 0.3876696228981018, "learning_rate": 1.5679012345679014e-05, "loss": 0.1385, "step": 26410 }, { "epoch": 3.433398310591293, "grad_norm": 0.7337366342544556, "learning_rate": 1.566601689408707e-05, "loss": 0.1469, "step": 26420 }, { "epoch": 3.4346978557504872, "grad_norm": 0.3061462342739105, "learning_rate": 1.565302144249513e-05, "loss": 0.113, "step": 26430 }, { "epoch": 3.4359974009096814, "grad_norm": 0.29021182656288147, "learning_rate": 1.5640025990903185e-05, "loss": 0.1306, "step": 26440 }, { "epoch": 3.437296946068876, "grad_norm": 0.35239171981811523, "learning_rate": 1.562703053931124e-05, "loss": 0.1051, "step": 26450 }, { "epoch": 3.43859649122807, "grad_norm": 0.24735617637634277, "learning_rate": 1.56140350877193e-05, "loss": 0.1128, "step": 26460 }, { "epoch": 3.4398960363872644, "grad_norm": 0.30267760157585144, "learning_rate": 1.5601039636127356e-05, "loss": 0.0957, "step": 26470 }, { "epoch": 3.4411955815464585, "grad_norm": 2.2564847469329834, "learning_rate": 1.5588044184535412e-05, "loss": 0.164, "step": 26480 }, { "epoch": 3.442495126705653, "grad_norm": 1.16069757938385, "learning_rate": 1.557504873294347e-05, "loss": 0.1052, "step": 26490 }, { "epoch": 3.4437946718648473, "grad_norm": 0.2518121898174286, "learning_rate": 1.5562053281351527e-05, "loss": 0.1263, "step": 26500 }, { "epoch": 3.4450942170240415, "grad_norm": 0.39050227403640747, "learning_rate": 1.5549057829759586e-05, "loss": 0.1183, "step": 26510 }, { "epoch": 3.4463937621832357, "grad_norm": 1.0378992557525635, "learning_rate": 1.5536062378167642e-05, "loss": 0.0846, "step": 26520 }, { "epoch": 3.4476933073424303, "grad_norm": 0.4426374137401581, "learning_rate": 1.5523066926575698e-05, "loss": 0.1201, "step": 26530 }, { "epoch": 3.4489928525016245, "grad_norm": 0.32138967514038086, "learning_rate": 1.5510071474983757e-05, "loss": 0.1159, "step": 26540 }, { "epoch": 3.4502923976608186, "grad_norm": 0.15705478191375732, "learning_rate": 1.5497076023391813e-05, "loss": 0.106, "step": 26550 }, { "epoch": 3.451591942820013, "grad_norm": 0.268147736787796, "learning_rate": 1.5484080571799872e-05, "loss": 0.0956, "step": 26560 }, { "epoch": 3.4528914879792074, "grad_norm": 0.2750931978225708, "learning_rate": 1.5471085120207928e-05, "loss": 0.1201, "step": 26570 }, { "epoch": 3.4541910331384016, "grad_norm": 0.1236884668469429, "learning_rate": 1.5458089668615984e-05, "loss": 0.1034, "step": 26580 }, { "epoch": 3.4554905782975958, "grad_norm": 0.6634644269943237, "learning_rate": 1.5445094217024043e-05, "loss": 0.1279, "step": 26590 }, { "epoch": 3.45679012345679, "grad_norm": 1.081878423690796, "learning_rate": 1.54320987654321e-05, "loss": 0.1379, "step": 26600 }, { "epoch": 3.4580896686159845, "grad_norm": 0.3537823259830475, "learning_rate": 1.541910331384016e-05, "loss": 0.1234, "step": 26610 }, { "epoch": 3.4593892137751787, "grad_norm": 0.7634983658790588, "learning_rate": 1.5406107862248214e-05, "loss": 0.1386, "step": 26620 }, { "epoch": 3.460688758934373, "grad_norm": 0.3638189435005188, "learning_rate": 1.539311241065627e-05, "loss": 0.1095, "step": 26630 }, { "epoch": 3.461988304093567, "grad_norm": 0.6426546573638916, "learning_rate": 1.538011695906433e-05, "loss": 0.08, "step": 26640 }, { "epoch": 3.4632878492527617, "grad_norm": 0.583743155002594, "learning_rate": 1.5367121507472385e-05, "loss": 0.1157, "step": 26650 }, { "epoch": 3.464587394411956, "grad_norm": 0.6457086801528931, "learning_rate": 1.5354126055880444e-05, "loss": 0.1202, "step": 26660 }, { "epoch": 3.46588693957115, "grad_norm": 0.6046968102455139, "learning_rate": 1.53411306042885e-05, "loss": 0.1924, "step": 26670 }, { "epoch": 3.467186484730344, "grad_norm": 0.33586564660072327, "learning_rate": 1.5328135152696556e-05, "loss": 0.1524, "step": 26680 }, { "epoch": 3.468486029889539, "grad_norm": 0.6159924268722534, "learning_rate": 1.5315139701104615e-05, "loss": 0.1717, "step": 26690 }, { "epoch": 3.469785575048733, "grad_norm": 0.21540454030036926, "learning_rate": 1.530214424951267e-05, "loss": 0.094, "step": 26700 }, { "epoch": 3.471085120207927, "grad_norm": 0.9118235111236572, "learning_rate": 1.528914879792073e-05, "loss": 0.1172, "step": 26710 }, { "epoch": 3.4723846653671213, "grad_norm": 0.3060864508152008, "learning_rate": 1.5276153346328786e-05, "loss": 0.1448, "step": 26720 }, { "epoch": 3.473684210526316, "grad_norm": 0.36489298939704895, "learning_rate": 1.5263157894736842e-05, "loss": 0.1499, "step": 26730 }, { "epoch": 3.47498375568551, "grad_norm": 0.5135205984115601, "learning_rate": 1.52501624431449e-05, "loss": 0.0684, "step": 26740 }, { "epoch": 3.4762833008447043, "grad_norm": 1.0268155336380005, "learning_rate": 1.5237166991552956e-05, "loss": 0.1646, "step": 26750 }, { "epoch": 3.4775828460038984, "grad_norm": 0.2608420252799988, "learning_rate": 1.5224171539961017e-05, "loss": 0.089, "step": 26760 }, { "epoch": 3.478882391163093, "grad_norm": 0.13178616762161255, "learning_rate": 1.5211176088369072e-05, "loss": 0.0907, "step": 26770 }, { "epoch": 3.4801819363222872, "grad_norm": 0.7355179190635681, "learning_rate": 1.5198180636777128e-05, "loss": 0.0954, "step": 26780 }, { "epoch": 3.4814814814814814, "grad_norm": 0.9709289073944092, "learning_rate": 1.5185185185185186e-05, "loss": 0.145, "step": 26790 }, { "epoch": 3.4827810266406756, "grad_norm": 0.8905351758003235, "learning_rate": 1.5172189733593242e-05, "loss": 0.0888, "step": 26800 }, { "epoch": 3.48408057179987, "grad_norm": 1.0302813053131104, "learning_rate": 1.5159194282001301e-05, "loss": 0.107, "step": 26810 }, { "epoch": 3.4853801169590644, "grad_norm": 0.3675960898399353, "learning_rate": 1.5146198830409358e-05, "loss": 0.117, "step": 26820 }, { "epoch": 3.4866796621182585, "grad_norm": 0.374348908662796, "learning_rate": 1.5133203378817414e-05, "loss": 0.1162, "step": 26830 }, { "epoch": 3.4879792072774527, "grad_norm": 0.7115879654884338, "learning_rate": 1.5120207927225472e-05, "loss": 0.2073, "step": 26840 }, { "epoch": 3.4892787524366473, "grad_norm": 1.559086799621582, "learning_rate": 1.5107212475633528e-05, "loss": 0.1527, "step": 26850 }, { "epoch": 3.4905782975958415, "grad_norm": 0.3336997330188751, "learning_rate": 1.5094217024041587e-05, "loss": 0.1234, "step": 26860 }, { "epoch": 3.4918778427550357, "grad_norm": 1.283095121383667, "learning_rate": 1.5081221572449645e-05, "loss": 0.0991, "step": 26870 }, { "epoch": 3.49317738791423, "grad_norm": 0.5341092348098755, "learning_rate": 1.50682261208577e-05, "loss": 0.147, "step": 26880 }, { "epoch": 3.4944769330734244, "grad_norm": 0.7064443826675415, "learning_rate": 1.5055230669265758e-05, "loss": 0.1795, "step": 26890 }, { "epoch": 3.4957764782326186, "grad_norm": 0.4411747455596924, "learning_rate": 1.5042235217673814e-05, "loss": 0.0829, "step": 26900 }, { "epoch": 3.497076023391813, "grad_norm": 0.47745051980018616, "learning_rate": 1.5029239766081873e-05, "loss": 0.1092, "step": 26910 }, { "epoch": 3.498375568551007, "grad_norm": 0.36952632665634155, "learning_rate": 1.501624431448993e-05, "loss": 0.1655, "step": 26920 }, { "epoch": 3.4996751137102016, "grad_norm": 1.634520173072815, "learning_rate": 1.5003248862897986e-05, "loss": 0.1666, "step": 26930 }, { "epoch": 3.5009746588693957, "grad_norm": 0.294060617685318, "learning_rate": 1.4990253411306044e-05, "loss": 0.0978, "step": 26940 }, { "epoch": 3.50227420402859, "grad_norm": 0.4247889816761017, "learning_rate": 1.49772579597141e-05, "loss": 0.1415, "step": 26950 }, { "epoch": 3.503573749187784, "grad_norm": 0.768470048904419, "learning_rate": 1.4964262508122157e-05, "loss": 0.1657, "step": 26960 }, { "epoch": 3.5048732943469787, "grad_norm": 0.4522714912891388, "learning_rate": 1.4951267056530217e-05, "loss": 0.1046, "step": 26970 }, { "epoch": 3.506172839506173, "grad_norm": 0.5216553211212158, "learning_rate": 1.4938271604938272e-05, "loss": 0.1109, "step": 26980 }, { "epoch": 3.507472384665367, "grad_norm": 0.29997122287750244, "learning_rate": 1.492527615334633e-05, "loss": 0.0942, "step": 26990 }, { "epoch": 3.5087719298245617, "grad_norm": 0.3573257327079773, "learning_rate": 1.4912280701754386e-05, "loss": 0.1208, "step": 27000 }, { "epoch": 3.510071474983756, "grad_norm": 0.19515781104564667, "learning_rate": 1.4899285250162442e-05, "loss": 0.1078, "step": 27010 }, { "epoch": 3.51137102014295, "grad_norm": 0.5854423642158508, "learning_rate": 1.4886289798570503e-05, "loss": 0.1548, "step": 27020 }, { "epoch": 3.512670565302144, "grad_norm": 0.3024062216281891, "learning_rate": 1.4873294346978559e-05, "loss": 0.1146, "step": 27030 }, { "epoch": 3.5139701104613383, "grad_norm": 0.21031694114208221, "learning_rate": 1.4860298895386614e-05, "loss": 0.1143, "step": 27040 }, { "epoch": 3.515269655620533, "grad_norm": 0.9056107997894287, "learning_rate": 1.4847303443794672e-05, "loss": 0.1108, "step": 27050 }, { "epoch": 3.516569200779727, "grad_norm": 0.13189654052257538, "learning_rate": 1.4834307992202728e-05, "loss": 0.151, "step": 27060 }, { "epoch": 3.5178687459389213, "grad_norm": 0.41449078917503357, "learning_rate": 1.4821312540610787e-05, "loss": 0.0793, "step": 27070 }, { "epoch": 3.519168291098116, "grad_norm": 0.37542450428009033, "learning_rate": 1.4808317089018845e-05, "loss": 0.0788, "step": 27080 }, { "epoch": 3.52046783625731, "grad_norm": 0.9598837494850159, "learning_rate": 1.47953216374269e-05, "loss": 0.1141, "step": 27090 }, { "epoch": 3.5217673814165043, "grad_norm": 0.4605017304420471, "learning_rate": 1.4782326185834958e-05, "loss": 0.091, "step": 27100 }, { "epoch": 3.5230669265756984, "grad_norm": 0.407951682806015, "learning_rate": 1.4769330734243014e-05, "loss": 0.1378, "step": 27110 }, { "epoch": 3.5243664717348926, "grad_norm": 0.46513041853904724, "learning_rate": 1.4756335282651073e-05, "loss": 0.0966, "step": 27120 }, { "epoch": 3.525666016894087, "grad_norm": 0.6855693459510803, "learning_rate": 1.474333983105913e-05, "loss": 0.1292, "step": 27130 }, { "epoch": 3.5269655620532814, "grad_norm": 0.6220144033432007, "learning_rate": 1.4730344379467186e-05, "loss": 0.1342, "step": 27140 }, { "epoch": 3.5282651072124755, "grad_norm": 0.2624132037162781, "learning_rate": 1.4717348927875244e-05, "loss": 0.109, "step": 27150 }, { "epoch": 3.52956465237167, "grad_norm": 0.3571415841579437, "learning_rate": 1.47043534762833e-05, "loss": 0.0794, "step": 27160 }, { "epoch": 3.5308641975308643, "grad_norm": 0.3916665315628052, "learning_rate": 1.4691358024691359e-05, "loss": 0.2195, "step": 27170 }, { "epoch": 3.5321637426900585, "grad_norm": 0.40554800629615784, "learning_rate": 1.4678362573099417e-05, "loss": 0.1687, "step": 27180 }, { "epoch": 3.5334632878492527, "grad_norm": 0.193308025598526, "learning_rate": 1.4665367121507473e-05, "loss": 0.1823, "step": 27190 }, { "epoch": 3.534762833008447, "grad_norm": 0.32685258984565735, "learning_rate": 1.465237166991553e-05, "loss": 0.1122, "step": 27200 }, { "epoch": 3.5360623781676415, "grad_norm": 0.36674168705940247, "learning_rate": 1.4639376218323586e-05, "loss": 0.1464, "step": 27210 }, { "epoch": 3.5373619233268356, "grad_norm": 0.6575894355773926, "learning_rate": 1.4626380766731645e-05, "loss": 0.1195, "step": 27220 }, { "epoch": 3.53866146848603, "grad_norm": 1.0105472803115845, "learning_rate": 1.4613385315139703e-05, "loss": 0.1559, "step": 27230 }, { "epoch": 3.5399610136452244, "grad_norm": 0.5628111958503723, "learning_rate": 1.4600389863547759e-05, "loss": 0.1199, "step": 27240 }, { "epoch": 3.5412605588044186, "grad_norm": 0.6324912309646606, "learning_rate": 1.4587394411955816e-05, "loss": 0.1362, "step": 27250 }, { "epoch": 3.5425601039636128, "grad_norm": 0.8117381930351257, "learning_rate": 1.4574398960363872e-05, "loss": 0.1694, "step": 27260 }, { "epoch": 3.543859649122807, "grad_norm": 0.3836987316608429, "learning_rate": 1.4561403508771931e-05, "loss": 0.1016, "step": 27270 }, { "epoch": 3.545159194282001, "grad_norm": 0.18053926527500153, "learning_rate": 1.4548408057179989e-05, "loss": 0.1071, "step": 27280 }, { "epoch": 3.5464587394411957, "grad_norm": 0.5377472639083862, "learning_rate": 1.4535412605588045e-05, "loss": 0.1553, "step": 27290 }, { "epoch": 3.54775828460039, "grad_norm": 0.2777753174304962, "learning_rate": 1.45224171539961e-05, "loss": 0.1559, "step": 27300 }, { "epoch": 3.549057829759584, "grad_norm": 0.34511303901672363, "learning_rate": 1.4509421702404158e-05, "loss": 0.1367, "step": 27310 }, { "epoch": 3.5503573749187787, "grad_norm": 0.7365157604217529, "learning_rate": 1.4496426250812217e-05, "loss": 0.1999, "step": 27320 }, { "epoch": 3.551656920077973, "grad_norm": 1.0869789123535156, "learning_rate": 1.4483430799220273e-05, "loss": 0.1622, "step": 27330 }, { "epoch": 3.552956465237167, "grad_norm": 0.27732008695602417, "learning_rate": 1.447043534762833e-05, "loss": 0.135, "step": 27340 }, { "epoch": 3.554256010396361, "grad_norm": 0.2626799941062927, "learning_rate": 1.4457439896036387e-05, "loss": 0.0922, "step": 27350 }, { "epoch": 3.5555555555555554, "grad_norm": 0.4188463091850281, "learning_rate": 1.4444444444444444e-05, "loss": 0.1303, "step": 27360 }, { "epoch": 3.55685510071475, "grad_norm": 0.3705924153327942, "learning_rate": 1.4431448992852503e-05, "loss": 0.1185, "step": 27370 }, { "epoch": 3.558154645873944, "grad_norm": 1.3483846187591553, "learning_rate": 1.441845354126056e-05, "loss": 0.1742, "step": 27380 }, { "epoch": 3.5594541910331383, "grad_norm": 0.48336365818977356, "learning_rate": 1.4405458089668617e-05, "loss": 0.1212, "step": 27390 }, { "epoch": 3.560753736192333, "grad_norm": 0.2568129003047943, "learning_rate": 1.4392462638076673e-05, "loss": 0.1029, "step": 27400 }, { "epoch": 3.562053281351527, "grad_norm": 0.2497936636209488, "learning_rate": 1.437946718648473e-05, "loss": 0.1332, "step": 27410 }, { "epoch": 3.5633528265107213, "grad_norm": 0.3473352789878845, "learning_rate": 1.436647173489279e-05, "loss": 0.0925, "step": 27420 }, { "epoch": 3.5646523716699154, "grad_norm": 0.695141613483429, "learning_rate": 1.4353476283300845e-05, "loss": 0.1296, "step": 27430 }, { "epoch": 3.5659519168291096, "grad_norm": 2.6575827598571777, "learning_rate": 1.4340480831708903e-05, "loss": 0.1148, "step": 27440 }, { "epoch": 3.5672514619883042, "grad_norm": 0.48556336760520935, "learning_rate": 1.4327485380116959e-05, "loss": 0.081, "step": 27450 }, { "epoch": 3.5685510071474984, "grad_norm": 0.3978970944881439, "learning_rate": 1.4314489928525016e-05, "loss": 0.1071, "step": 27460 }, { "epoch": 3.5698505523066926, "grad_norm": 1.1850694417953491, "learning_rate": 1.4301494476933075e-05, "loss": 0.161, "step": 27470 }, { "epoch": 3.571150097465887, "grad_norm": 0.7486498355865479, "learning_rate": 1.4288499025341131e-05, "loss": 0.1463, "step": 27480 }, { "epoch": 3.5724496426250814, "grad_norm": 0.629131555557251, "learning_rate": 1.4275503573749189e-05, "loss": 0.1729, "step": 27490 }, { "epoch": 3.5737491877842755, "grad_norm": 0.7682366371154785, "learning_rate": 1.4262508122157245e-05, "loss": 0.1551, "step": 27500 }, { "epoch": 3.5750487329434697, "grad_norm": 1.4636331796646118, "learning_rate": 1.4249512670565302e-05, "loss": 0.1349, "step": 27510 }, { "epoch": 3.576348278102664, "grad_norm": 0.40043237805366516, "learning_rate": 1.4236517218973362e-05, "loss": 0.1057, "step": 27520 }, { "epoch": 3.5776478232618585, "grad_norm": 1.230865240097046, "learning_rate": 1.4223521767381417e-05, "loss": 0.1491, "step": 27530 }, { "epoch": 3.5789473684210527, "grad_norm": 0.14414678514003754, "learning_rate": 1.4210526315789475e-05, "loss": 0.0831, "step": 27540 }, { "epoch": 3.580246913580247, "grad_norm": 0.3249838352203369, "learning_rate": 1.419753086419753e-05, "loss": 0.1396, "step": 27550 }, { "epoch": 3.5815464587394414, "grad_norm": 0.20244799554347992, "learning_rate": 1.4184535412605587e-05, "loss": 0.1104, "step": 27560 }, { "epoch": 3.5828460038986356, "grad_norm": 0.7135229110717773, "learning_rate": 1.4171539961013648e-05, "loss": 0.087, "step": 27570 }, { "epoch": 3.58414554905783, "grad_norm": 0.8320591449737549, "learning_rate": 1.4158544509421703e-05, "loss": 0.1247, "step": 27580 }, { "epoch": 3.585445094217024, "grad_norm": 0.2513565421104431, "learning_rate": 1.414554905782976e-05, "loss": 0.1218, "step": 27590 }, { "epoch": 3.586744639376218, "grad_norm": 0.19945336878299713, "learning_rate": 1.4132553606237817e-05, "loss": 0.1278, "step": 27600 }, { "epoch": 3.5880441845354127, "grad_norm": 0.3402836322784424, "learning_rate": 1.4119558154645873e-05, "loss": 0.1617, "step": 27610 }, { "epoch": 3.589343729694607, "grad_norm": 0.3901427984237671, "learning_rate": 1.4106562703053932e-05, "loss": 0.1036, "step": 27620 }, { "epoch": 3.590643274853801, "grad_norm": 0.6286407113075256, "learning_rate": 1.409356725146199e-05, "loss": 0.1178, "step": 27630 }, { "epoch": 3.5919428200129957, "grad_norm": 0.7049890756607056, "learning_rate": 1.4080571799870045e-05, "loss": 0.1177, "step": 27640 }, { "epoch": 3.59324236517219, "grad_norm": 0.2078087478876114, "learning_rate": 1.4067576348278103e-05, "loss": 0.1037, "step": 27650 }, { "epoch": 3.594541910331384, "grad_norm": 0.37443703413009644, "learning_rate": 1.4054580896686159e-05, "loss": 0.1361, "step": 27660 }, { "epoch": 3.595841455490578, "grad_norm": 1.1365044116973877, "learning_rate": 1.4041585445094218e-05, "loss": 0.1473, "step": 27670 }, { "epoch": 3.5971410006497724, "grad_norm": 0.20281270146369934, "learning_rate": 1.4028589993502276e-05, "loss": 0.1012, "step": 27680 }, { "epoch": 3.598440545808967, "grad_norm": 0.6756500005722046, "learning_rate": 1.4015594541910331e-05, "loss": 0.1556, "step": 27690 }, { "epoch": 3.599740090968161, "grad_norm": 0.29948481917381287, "learning_rate": 1.4002599090318389e-05, "loss": 0.124, "step": 27700 }, { "epoch": 3.6010396361273553, "grad_norm": 1.2703522443771362, "learning_rate": 1.3989603638726445e-05, "loss": 0.0897, "step": 27710 }, { "epoch": 3.60233918128655, "grad_norm": 0.37417396903038025, "learning_rate": 1.3976608187134504e-05, "loss": 0.1495, "step": 27720 }, { "epoch": 3.603638726445744, "grad_norm": 0.23137831687927246, "learning_rate": 1.3963612735542562e-05, "loss": 0.0908, "step": 27730 }, { "epoch": 3.6049382716049383, "grad_norm": 1.1781548261642456, "learning_rate": 1.3950617283950617e-05, "loss": 0.1727, "step": 27740 }, { "epoch": 3.6062378167641325, "grad_norm": 1.0463452339172363, "learning_rate": 1.3937621832358675e-05, "loss": 0.1151, "step": 27750 }, { "epoch": 3.6075373619233266, "grad_norm": 0.1831691563129425, "learning_rate": 1.3924626380766731e-05, "loss": 0.1482, "step": 27760 }, { "epoch": 3.6088369070825213, "grad_norm": 0.7948482632637024, "learning_rate": 1.391163092917479e-05, "loss": 0.1227, "step": 27770 }, { "epoch": 3.6101364522417154, "grad_norm": 0.5800575017929077, "learning_rate": 1.3898635477582848e-05, "loss": 0.1004, "step": 27780 }, { "epoch": 3.6114359974009096, "grad_norm": 0.24395297467708588, "learning_rate": 1.3885640025990904e-05, "loss": 0.0743, "step": 27790 }, { "epoch": 3.612735542560104, "grad_norm": 0.3652712404727936, "learning_rate": 1.3872644574398961e-05, "loss": 0.112, "step": 27800 }, { "epoch": 3.6140350877192984, "grad_norm": 0.4876977801322937, "learning_rate": 1.3859649122807017e-05, "loss": 0.1377, "step": 27810 }, { "epoch": 3.6153346328784925, "grad_norm": 1.1139076948165894, "learning_rate": 1.3846653671215076e-05, "loss": 0.1316, "step": 27820 }, { "epoch": 3.6166341780376867, "grad_norm": 0.5760275721549988, "learning_rate": 1.3833658219623134e-05, "loss": 0.1189, "step": 27830 }, { "epoch": 3.617933723196881, "grad_norm": 0.6012194156646729, "learning_rate": 1.382066276803119e-05, "loss": 0.1603, "step": 27840 }, { "epoch": 3.6192332683560755, "grad_norm": 0.376777321100235, "learning_rate": 1.3807667316439245e-05, "loss": 0.1173, "step": 27850 }, { "epoch": 3.6205328135152697, "grad_norm": 0.2775436341762543, "learning_rate": 1.3794671864847303e-05, "loss": 0.1082, "step": 27860 }, { "epoch": 3.621832358674464, "grad_norm": 0.6738190650939941, "learning_rate": 1.3781676413255362e-05, "loss": 0.1197, "step": 27870 }, { "epoch": 3.6231319038336585, "grad_norm": 0.7098721265792847, "learning_rate": 1.3768680961663418e-05, "loss": 0.1017, "step": 27880 }, { "epoch": 3.6244314489928526, "grad_norm": 0.5277796983718872, "learning_rate": 1.3755685510071476e-05, "loss": 0.1178, "step": 27890 }, { "epoch": 3.625730994152047, "grad_norm": 0.6924259066581726, "learning_rate": 1.3742690058479531e-05, "loss": 0.1258, "step": 27900 }, { "epoch": 3.627030539311241, "grad_norm": 0.29889845848083496, "learning_rate": 1.3729694606887589e-05, "loss": 0.091, "step": 27910 }, { "epoch": 3.628330084470435, "grad_norm": 0.3732012212276459, "learning_rate": 1.3716699155295648e-05, "loss": 0.0984, "step": 27920 }, { "epoch": 3.6296296296296298, "grad_norm": 0.7129666805267334, "learning_rate": 1.3703703703703704e-05, "loss": 0.1144, "step": 27930 }, { "epoch": 3.630929174788824, "grad_norm": 0.3980967700481415, "learning_rate": 1.3690708252111762e-05, "loss": 0.1327, "step": 27940 }, { "epoch": 3.632228719948018, "grad_norm": 0.602085292339325, "learning_rate": 1.3677712800519818e-05, "loss": 0.158, "step": 27950 }, { "epoch": 3.6335282651072127, "grad_norm": 0.43848663568496704, "learning_rate": 1.3664717348927875e-05, "loss": 0.121, "step": 27960 }, { "epoch": 3.634827810266407, "grad_norm": 0.8211021423339844, "learning_rate": 1.3651721897335934e-05, "loss": 0.1232, "step": 27970 }, { "epoch": 3.636127355425601, "grad_norm": 0.007814254611730576, "learning_rate": 1.363872644574399e-05, "loss": 0.1214, "step": 27980 }, { "epoch": 3.6374269005847952, "grad_norm": 0.8539933562278748, "learning_rate": 1.3625730994152048e-05, "loss": 0.1149, "step": 27990 }, { "epoch": 3.6387264457439894, "grad_norm": 0.5748565196990967, "learning_rate": 1.3612735542560104e-05, "loss": 0.1623, "step": 28000 }, { "epoch": 3.640025990903184, "grad_norm": 0.183331698179245, "learning_rate": 1.3599740090968161e-05, "loss": 0.0949, "step": 28010 }, { "epoch": 3.641325536062378, "grad_norm": 0.42417988181114197, "learning_rate": 1.358674463937622e-05, "loss": 0.1515, "step": 28020 }, { "epoch": 3.6426250812215724, "grad_norm": 0.14776958525180817, "learning_rate": 1.3573749187784276e-05, "loss": 0.1178, "step": 28030 }, { "epoch": 3.643924626380767, "grad_norm": 0.23126395046710968, "learning_rate": 1.3560753736192334e-05, "loss": 0.1211, "step": 28040 }, { "epoch": 3.645224171539961, "grad_norm": 1.0210869312286377, "learning_rate": 1.354775828460039e-05, "loss": 0.1415, "step": 28050 }, { "epoch": 3.6465237166991553, "grad_norm": 0.4180763363838196, "learning_rate": 1.3534762833008447e-05, "loss": 0.1389, "step": 28060 }, { "epoch": 3.6478232618583495, "grad_norm": 0.3691835403442383, "learning_rate": 1.3521767381416506e-05, "loss": 0.0788, "step": 28070 }, { "epoch": 3.6491228070175437, "grad_norm": 0.3397236764431, "learning_rate": 1.3508771929824562e-05, "loss": 0.1249, "step": 28080 }, { "epoch": 3.6504223521767383, "grad_norm": 0.2922205328941345, "learning_rate": 1.349577647823262e-05, "loss": 0.1282, "step": 28090 }, { "epoch": 3.6517218973359324, "grad_norm": 0.5863010287284851, "learning_rate": 1.3482781026640676e-05, "loss": 0.1039, "step": 28100 }, { "epoch": 3.6530214424951266, "grad_norm": 0.7283901572227478, "learning_rate": 1.3469785575048732e-05, "loss": 0.125, "step": 28110 }, { "epoch": 3.6543209876543212, "grad_norm": 0.20181629061698914, "learning_rate": 1.3456790123456793e-05, "loss": 0.1247, "step": 28120 }, { "epoch": 3.6556205328135154, "grad_norm": 0.719542384147644, "learning_rate": 1.3443794671864848e-05, "loss": 0.0924, "step": 28130 }, { "epoch": 3.6569200779727096, "grad_norm": 0.48898765444755554, "learning_rate": 1.3430799220272904e-05, "loss": 0.1148, "step": 28140 }, { "epoch": 3.6582196231319037, "grad_norm": 0.4252176880836487, "learning_rate": 1.3417803768680962e-05, "loss": 0.1359, "step": 28150 }, { "epoch": 3.659519168291098, "grad_norm": 0.41134634613990784, "learning_rate": 1.3404808317089018e-05, "loss": 0.124, "step": 28160 }, { "epoch": 3.6608187134502925, "grad_norm": 0.35484036803245544, "learning_rate": 1.3391812865497077e-05, "loss": 0.0967, "step": 28170 }, { "epoch": 3.6621182586094867, "grad_norm": 0.5343708992004395, "learning_rate": 1.3378817413905134e-05, "loss": 0.1556, "step": 28180 }, { "epoch": 3.663417803768681, "grad_norm": 0.4119149446487427, "learning_rate": 1.336582196231319e-05, "loss": 0.0914, "step": 28190 }, { "epoch": 3.6647173489278755, "grad_norm": 0.8944699168205261, "learning_rate": 1.3352826510721248e-05, "loss": 0.1419, "step": 28200 }, { "epoch": 3.6660168940870697, "grad_norm": 0.12360885739326477, "learning_rate": 1.3339831059129304e-05, "loss": 0.1024, "step": 28210 }, { "epoch": 3.667316439246264, "grad_norm": 0.3740479648113251, "learning_rate": 1.3326835607537363e-05, "loss": 0.0937, "step": 28220 }, { "epoch": 3.668615984405458, "grad_norm": 0.6703581213951111, "learning_rate": 1.331384015594542e-05, "loss": 0.1235, "step": 28230 }, { "epoch": 3.669915529564652, "grad_norm": 0.7074266076087952, "learning_rate": 1.3300844704353476e-05, "loss": 0.1229, "step": 28240 }, { "epoch": 3.671215074723847, "grad_norm": 0.3561890125274658, "learning_rate": 1.3287849252761534e-05, "loss": 0.1034, "step": 28250 }, { "epoch": 3.672514619883041, "grad_norm": 0.35760393738746643, "learning_rate": 1.327485380116959e-05, "loss": 0.0848, "step": 28260 }, { "epoch": 3.673814165042235, "grad_norm": 0.5150129795074463, "learning_rate": 1.3261858349577649e-05, "loss": 0.1039, "step": 28270 }, { "epoch": 3.6751137102014297, "grad_norm": 0.7175201773643494, "learning_rate": 1.3248862897985707e-05, "loss": 0.1227, "step": 28280 }, { "epoch": 3.676413255360624, "grad_norm": 0.6190811991691589, "learning_rate": 1.3235867446393762e-05, "loss": 0.0977, "step": 28290 }, { "epoch": 3.677712800519818, "grad_norm": 0.34577226638793945, "learning_rate": 1.322287199480182e-05, "loss": 0.094, "step": 28300 }, { "epoch": 3.6790123456790123, "grad_norm": 1.196403980255127, "learning_rate": 1.3209876543209876e-05, "loss": 0.1342, "step": 28310 }, { "epoch": 3.6803118908382064, "grad_norm": 1.1487503051757812, "learning_rate": 1.3196881091617935e-05, "loss": 0.0902, "step": 28320 }, { "epoch": 3.681611435997401, "grad_norm": 0.410552442073822, "learning_rate": 1.3183885640025993e-05, "loss": 0.1132, "step": 28330 }, { "epoch": 3.682910981156595, "grad_norm": 0.2658521831035614, "learning_rate": 1.3170890188434048e-05, "loss": 0.1274, "step": 28340 }, { "epoch": 3.6842105263157894, "grad_norm": 0.25172314047813416, "learning_rate": 1.3157894736842106e-05, "loss": 0.0912, "step": 28350 }, { "epoch": 3.685510071474984, "grad_norm": 0.22713272273540497, "learning_rate": 1.3144899285250162e-05, "loss": 0.0917, "step": 28360 }, { "epoch": 3.686809616634178, "grad_norm": 0.30172207951545715, "learning_rate": 1.3131903833658221e-05, "loss": 0.1253, "step": 28370 }, { "epoch": 3.6881091617933723, "grad_norm": 0.20189762115478516, "learning_rate": 1.3118908382066279e-05, "loss": 0.0981, "step": 28380 }, { "epoch": 3.6894087069525665, "grad_norm": 0.5046433806419373, "learning_rate": 1.3105912930474334e-05, "loss": 0.0862, "step": 28390 }, { "epoch": 3.6907082521117607, "grad_norm": 0.2034302055835724, "learning_rate": 1.309291747888239e-05, "loss": 0.1016, "step": 28400 }, { "epoch": 3.6920077972709553, "grad_norm": 1.0794312953948975, "learning_rate": 1.3079922027290448e-05, "loss": 0.1226, "step": 28410 }, { "epoch": 3.6933073424301495, "grad_norm": 0.3739759922027588, "learning_rate": 1.3066926575698507e-05, "loss": 0.1599, "step": 28420 }, { "epoch": 3.6946068875893436, "grad_norm": 0.551996111869812, "learning_rate": 1.3053931124106563e-05, "loss": 0.135, "step": 28430 }, { "epoch": 3.6959064327485383, "grad_norm": 0.34379807114601135, "learning_rate": 1.304093567251462e-05, "loss": 0.0919, "step": 28440 }, { "epoch": 3.6972059779077324, "grad_norm": 0.7844880223274231, "learning_rate": 1.3027940220922676e-05, "loss": 0.0963, "step": 28450 }, { "epoch": 3.6985055230669266, "grad_norm": 0.8433283567428589, "learning_rate": 1.3014944769330734e-05, "loss": 0.1138, "step": 28460 }, { "epoch": 3.6998050682261208, "grad_norm": 0.47169211506843567, "learning_rate": 1.3001949317738793e-05, "loss": 0.1013, "step": 28470 }, { "epoch": 3.701104613385315, "grad_norm": 0.36000990867614746, "learning_rate": 1.2988953866146849e-05, "loss": 0.1089, "step": 28480 }, { "epoch": 3.7024041585445095, "grad_norm": 0.48474952578544617, "learning_rate": 1.2975958414554907e-05, "loss": 0.1308, "step": 28490 }, { "epoch": 3.7037037037037037, "grad_norm": 0.5869434475898743, "learning_rate": 1.2962962962962962e-05, "loss": 0.1175, "step": 28500 }, { "epoch": 3.705003248862898, "grad_norm": 0.6000356674194336, "learning_rate": 1.294996751137102e-05, "loss": 0.1056, "step": 28510 }, { "epoch": 3.7063027940220925, "grad_norm": 0.42208778858184814, "learning_rate": 1.293697205977908e-05, "loss": 0.0799, "step": 28520 }, { "epoch": 3.7076023391812867, "grad_norm": 0.2111825793981552, "learning_rate": 1.2923976608187135e-05, "loss": 0.1375, "step": 28530 }, { "epoch": 3.708901884340481, "grad_norm": 0.5543553829193115, "learning_rate": 1.2910981156595193e-05, "loss": 0.1197, "step": 28540 }, { "epoch": 3.710201429499675, "grad_norm": 1.1174521446228027, "learning_rate": 1.2897985705003249e-05, "loss": 0.142, "step": 28550 }, { "epoch": 3.711500974658869, "grad_norm": 0.3617250919342041, "learning_rate": 1.2884990253411306e-05, "loss": 0.117, "step": 28560 }, { "epoch": 3.712800519818064, "grad_norm": 0.5897022485733032, "learning_rate": 1.2871994801819365e-05, "loss": 0.1411, "step": 28570 }, { "epoch": 3.714100064977258, "grad_norm": 0.7084951996803284, "learning_rate": 1.2858999350227421e-05, "loss": 0.1304, "step": 28580 }, { "epoch": 3.715399610136452, "grad_norm": 0.4945928156375885, "learning_rate": 1.2846003898635479e-05, "loss": 0.0693, "step": 28590 }, { "epoch": 3.7166991552956468, "grad_norm": 0.25206616520881653, "learning_rate": 1.2833008447043535e-05, "loss": 0.137, "step": 28600 }, { "epoch": 3.717998700454841, "grad_norm": 0.2223711758852005, "learning_rate": 1.2820012995451592e-05, "loss": 0.1373, "step": 28610 }, { "epoch": 3.719298245614035, "grad_norm": 0.29711756110191345, "learning_rate": 1.2807017543859651e-05, "loss": 0.092, "step": 28620 }, { "epoch": 3.7205977907732293, "grad_norm": 0.24079498648643494, "learning_rate": 1.2794022092267707e-05, "loss": 0.0761, "step": 28630 }, { "epoch": 3.7218973359324234, "grad_norm": 0.5554168820381165, "learning_rate": 1.2781026640675765e-05, "loss": 0.0998, "step": 28640 }, { "epoch": 3.723196881091618, "grad_norm": 0.2249438315629959, "learning_rate": 1.276803118908382e-05, "loss": 0.1686, "step": 28650 }, { "epoch": 3.7244964262508122, "grad_norm": 0.49691885709762573, "learning_rate": 1.2755035737491876e-05, "loss": 0.1256, "step": 28660 }, { "epoch": 3.7257959714100064, "grad_norm": 0.9614166021347046, "learning_rate": 1.2742040285899937e-05, "loss": 0.1554, "step": 28670 }, { "epoch": 3.727095516569201, "grad_norm": 0.7300058603286743, "learning_rate": 1.2729044834307993e-05, "loss": 0.1634, "step": 28680 }, { "epoch": 3.728395061728395, "grad_norm": 0.454207181930542, "learning_rate": 1.2716049382716049e-05, "loss": 0.1308, "step": 28690 }, { "epoch": 3.7296946068875894, "grad_norm": 0.1380700170993805, "learning_rate": 1.2703053931124107e-05, "loss": 0.1205, "step": 28700 }, { "epoch": 3.7309941520467835, "grad_norm": 0.32925382256507874, "learning_rate": 1.2690058479532163e-05, "loss": 0.1136, "step": 28710 }, { "epoch": 3.7322936972059777, "grad_norm": 0.3496936857700348, "learning_rate": 1.2677063027940222e-05, "loss": 0.1207, "step": 28720 }, { "epoch": 3.7335932423651723, "grad_norm": 1.0452567338943481, "learning_rate": 1.266406757634828e-05, "loss": 0.1378, "step": 28730 }, { "epoch": 3.7348927875243665, "grad_norm": 0.6820533275604248, "learning_rate": 1.2651072124756335e-05, "loss": 0.1055, "step": 28740 }, { "epoch": 3.7361923326835607, "grad_norm": 0.20418837666511536, "learning_rate": 1.2638076673164393e-05, "loss": 0.1446, "step": 28750 }, { "epoch": 3.7374918778427553, "grad_norm": 0.5658933520317078, "learning_rate": 1.2625081221572449e-05, "loss": 0.1818, "step": 28760 }, { "epoch": 3.7387914230019494, "grad_norm": 0.39641332626342773, "learning_rate": 1.2612085769980508e-05, "loss": 0.1468, "step": 28770 }, { "epoch": 3.7400909681611436, "grad_norm": 0.7551290988922119, "learning_rate": 1.2599090318388565e-05, "loss": 0.1922, "step": 28780 }, { "epoch": 3.741390513320338, "grad_norm": 0.2382839173078537, "learning_rate": 1.2586094866796621e-05, "loss": 0.1294, "step": 28790 }, { "epoch": 3.742690058479532, "grad_norm": 0.43582576513290405, "learning_rate": 1.2573099415204679e-05, "loss": 0.0922, "step": 28800 }, { "epoch": 3.7439896036387266, "grad_norm": 1.0046758651733398, "learning_rate": 1.2560103963612735e-05, "loss": 0.0863, "step": 28810 }, { "epoch": 3.7452891487979207, "grad_norm": 0.46438276767730713, "learning_rate": 1.2547108512020794e-05, "loss": 0.1087, "step": 28820 }, { "epoch": 3.746588693957115, "grad_norm": 0.3255796432495117, "learning_rate": 1.2534113060428851e-05, "loss": 0.0798, "step": 28830 }, { "epoch": 3.7478882391163095, "grad_norm": 0.9897168874740601, "learning_rate": 1.2521117608836907e-05, "loss": 0.1305, "step": 28840 }, { "epoch": 3.7491877842755037, "grad_norm": 0.5797418355941772, "learning_rate": 1.2508122157244965e-05, "loss": 0.1212, "step": 28850 }, { "epoch": 3.750487329434698, "grad_norm": 0.4680408835411072, "learning_rate": 1.2495126705653022e-05, "loss": 0.1299, "step": 28860 }, { "epoch": 3.751786874593892, "grad_norm": 0.8729811310768127, "learning_rate": 1.2482131254061078e-05, "loss": 0.1008, "step": 28870 }, { "epoch": 3.753086419753086, "grad_norm": 1.0278825759887695, "learning_rate": 1.2469135802469137e-05, "loss": 0.1123, "step": 28880 }, { "epoch": 3.754385964912281, "grad_norm": 0.19176314771175385, "learning_rate": 1.2456140350877193e-05, "loss": 0.096, "step": 28890 }, { "epoch": 3.755685510071475, "grad_norm": 0.516315758228302, "learning_rate": 1.2443144899285251e-05, "loss": 0.1049, "step": 28900 }, { "epoch": 3.756985055230669, "grad_norm": 0.2913684546947479, "learning_rate": 1.2430149447693308e-05, "loss": 0.1429, "step": 28910 }, { "epoch": 3.758284600389864, "grad_norm": 1.3585683107376099, "learning_rate": 1.2417153996101364e-05, "loss": 0.1331, "step": 28920 }, { "epoch": 3.759584145549058, "grad_norm": 0.1855938732624054, "learning_rate": 1.2404158544509424e-05, "loss": 0.0896, "step": 28930 }, { "epoch": 3.760883690708252, "grad_norm": 0.7372893691062927, "learning_rate": 1.239116309291748e-05, "loss": 0.1689, "step": 28940 }, { "epoch": 3.7621832358674463, "grad_norm": 0.5469597578048706, "learning_rate": 1.2378167641325535e-05, "loss": 0.1248, "step": 28950 }, { "epoch": 3.7634827810266405, "grad_norm": 0.5464847683906555, "learning_rate": 1.2365172189733595e-05, "loss": 0.0891, "step": 28960 }, { "epoch": 3.764782326185835, "grad_norm": 0.3831331133842468, "learning_rate": 1.235217673814165e-05, "loss": 0.0837, "step": 28970 }, { "epoch": 3.7660818713450293, "grad_norm": 0.6211572289466858, "learning_rate": 1.2339181286549708e-05, "loss": 0.1443, "step": 28980 }, { "epoch": 3.7673814165042234, "grad_norm": 0.6267490386962891, "learning_rate": 1.2326185834957765e-05, "loss": 0.1499, "step": 28990 }, { "epoch": 3.768680961663418, "grad_norm": 0.3738892078399658, "learning_rate": 1.2313190383365821e-05, "loss": 0.1814, "step": 29000 }, { "epoch": 3.769980506822612, "grad_norm": 0.3591698706150055, "learning_rate": 1.230019493177388e-05, "loss": 0.1337, "step": 29010 }, { "epoch": 3.7712800519818064, "grad_norm": 1.0168412923812866, "learning_rate": 1.2287199480181936e-05, "loss": 0.1166, "step": 29020 }, { "epoch": 3.7725795971410006, "grad_norm": 0.38217025995254517, "learning_rate": 1.2274204028589994e-05, "loss": 0.091, "step": 29030 }, { "epoch": 3.7738791423001947, "grad_norm": 0.5201651453971863, "learning_rate": 1.2261208576998052e-05, "loss": 0.1247, "step": 29040 }, { "epoch": 3.7751786874593893, "grad_norm": 0.6340731382369995, "learning_rate": 1.2248213125406107e-05, "loss": 0.1283, "step": 29050 }, { "epoch": 3.7764782326185835, "grad_norm": 0.2602476477622986, "learning_rate": 1.2235217673814167e-05, "loss": 0.1706, "step": 29060 }, { "epoch": 3.7777777777777777, "grad_norm": 0.6892197728157043, "learning_rate": 1.2222222222222222e-05, "loss": 0.1162, "step": 29070 }, { "epoch": 3.7790773229369723, "grad_norm": 0.4374246299266815, "learning_rate": 1.220922677063028e-05, "loss": 0.0773, "step": 29080 }, { "epoch": 3.7803768680961665, "grad_norm": 0.5986202955245972, "learning_rate": 1.2196231319038338e-05, "loss": 0.1487, "step": 29090 }, { "epoch": 3.7816764132553606, "grad_norm": 0.5334256291389465, "learning_rate": 1.2183235867446393e-05, "loss": 0.1143, "step": 29100 }, { "epoch": 3.782975958414555, "grad_norm": 1.1914550065994263, "learning_rate": 1.2170240415854451e-05, "loss": 0.1269, "step": 29110 }, { "epoch": 3.784275503573749, "grad_norm": 0.48428958654403687, "learning_rate": 1.2157244964262509e-05, "loss": 0.1474, "step": 29120 }, { "epoch": 3.7855750487329436, "grad_norm": 0.1499575972557068, "learning_rate": 1.2144249512670566e-05, "loss": 0.1239, "step": 29130 }, { "epoch": 3.7868745938921378, "grad_norm": 0.43723946809768677, "learning_rate": 1.2131254061078624e-05, "loss": 0.1229, "step": 29140 }, { "epoch": 3.788174139051332, "grad_norm": 0.9183163046836853, "learning_rate": 1.211825860948668e-05, "loss": 0.1415, "step": 29150 }, { "epoch": 3.7894736842105265, "grad_norm": 0.22757884860038757, "learning_rate": 1.2105263157894737e-05, "loss": 0.0799, "step": 29160 }, { "epoch": 3.7907732293697207, "grad_norm": 0.7640048861503601, "learning_rate": 1.2092267706302795e-05, "loss": 0.1129, "step": 29170 }, { "epoch": 3.792072774528915, "grad_norm": 0.6067577600479126, "learning_rate": 1.2079272254710852e-05, "loss": 0.1195, "step": 29180 }, { "epoch": 3.793372319688109, "grad_norm": 0.24529753625392914, "learning_rate": 1.206627680311891e-05, "loss": 0.1287, "step": 29190 }, { "epoch": 3.7946718648473032, "grad_norm": 0.8407741189002991, "learning_rate": 1.2053281351526966e-05, "loss": 0.1164, "step": 29200 }, { "epoch": 3.795971410006498, "grad_norm": 0.6182228326797485, "learning_rate": 1.2040285899935023e-05, "loss": 0.1141, "step": 29210 }, { "epoch": 3.797270955165692, "grad_norm": 0.6430752873420715, "learning_rate": 1.202729044834308e-05, "loss": 0.1278, "step": 29220 }, { "epoch": 3.798570500324886, "grad_norm": 0.38406234979629517, "learning_rate": 1.2014294996751138e-05, "loss": 0.1276, "step": 29230 }, { "epoch": 3.799870045484081, "grad_norm": 0.361672967672348, "learning_rate": 1.2001299545159194e-05, "loss": 0.1146, "step": 29240 }, { "epoch": 3.801169590643275, "grad_norm": 0.6437076330184937, "learning_rate": 1.1988304093567252e-05, "loss": 0.1026, "step": 29250 }, { "epoch": 3.802469135802469, "grad_norm": 0.1858055740594864, "learning_rate": 1.1975308641975309e-05, "loss": 0.1701, "step": 29260 }, { "epoch": 3.8037686809616633, "grad_norm": 0.6290071606636047, "learning_rate": 1.1962313190383367e-05, "loss": 0.0788, "step": 29270 }, { "epoch": 3.8050682261208575, "grad_norm": 0.3997516334056854, "learning_rate": 1.1949317738791424e-05, "loss": 0.1611, "step": 29280 }, { "epoch": 3.806367771280052, "grad_norm": 0.6703053116798401, "learning_rate": 1.193632228719948e-05, "loss": 0.0969, "step": 29290 }, { "epoch": 3.8076673164392463, "grad_norm": 2.04032564163208, "learning_rate": 1.1923326835607538e-05, "loss": 0.1686, "step": 29300 }, { "epoch": 3.8089668615984404, "grad_norm": 0.7580338716506958, "learning_rate": 1.1910331384015595e-05, "loss": 0.1141, "step": 29310 }, { "epoch": 3.810266406757635, "grad_norm": 0.5125283598899841, "learning_rate": 1.1897335932423653e-05, "loss": 0.1235, "step": 29320 }, { "epoch": 3.8115659519168292, "grad_norm": 0.37615466117858887, "learning_rate": 1.188434048083171e-05, "loss": 0.11, "step": 29330 }, { "epoch": 3.8128654970760234, "grad_norm": 0.3552311062812805, "learning_rate": 1.1871345029239766e-05, "loss": 0.127, "step": 29340 }, { "epoch": 3.8141650422352176, "grad_norm": 0.6052232384681702, "learning_rate": 1.1858349577647824e-05, "loss": 0.1432, "step": 29350 }, { "epoch": 3.8154645873944117, "grad_norm": 0.422917902469635, "learning_rate": 1.1845354126055881e-05, "loss": 0.1108, "step": 29360 }, { "epoch": 3.8167641325536064, "grad_norm": 0.5306421518325806, "learning_rate": 1.1832358674463937e-05, "loss": 0.1648, "step": 29370 }, { "epoch": 3.8180636777128005, "grad_norm": 0.31645163893699646, "learning_rate": 1.1819363222871996e-05, "loss": 0.1355, "step": 29380 }, { "epoch": 3.8193632228719947, "grad_norm": 0.1272791177034378, "learning_rate": 1.1806367771280052e-05, "loss": 0.1381, "step": 29390 }, { "epoch": 3.8206627680311893, "grad_norm": 0.3017774224281311, "learning_rate": 1.179337231968811e-05, "loss": 0.0775, "step": 29400 }, { "epoch": 3.8219623131903835, "grad_norm": 0.9524250626564026, "learning_rate": 1.1780376868096167e-05, "loss": 0.1486, "step": 29410 }, { "epoch": 3.8232618583495777, "grad_norm": 0.8093119263648987, "learning_rate": 1.1767381416504223e-05, "loss": 0.1793, "step": 29420 }, { "epoch": 3.824561403508772, "grad_norm": 0.7352808117866516, "learning_rate": 1.1754385964912282e-05, "loss": 0.156, "step": 29430 }, { "epoch": 3.825860948667966, "grad_norm": 1.149355173110962, "learning_rate": 1.1741390513320338e-05, "loss": 0.1349, "step": 29440 }, { "epoch": 3.8271604938271606, "grad_norm": 0.36587679386138916, "learning_rate": 1.1728395061728396e-05, "loss": 0.1063, "step": 29450 }, { "epoch": 3.828460038986355, "grad_norm": 0.1890907883644104, "learning_rate": 1.1715399610136453e-05, "loss": 0.0861, "step": 29460 }, { "epoch": 3.829759584145549, "grad_norm": 0.4032920300960541, "learning_rate": 1.170240415854451e-05, "loss": 0.1012, "step": 29470 }, { "epoch": 3.8310591293047436, "grad_norm": 0.5081347823143005, "learning_rate": 1.1689408706952568e-05, "loss": 0.1325, "step": 29480 }, { "epoch": 3.8323586744639377, "grad_norm": 0.5868605375289917, "learning_rate": 1.1676413255360624e-05, "loss": 0.1471, "step": 29490 }, { "epoch": 3.833658219623132, "grad_norm": 0.5119397044181824, "learning_rate": 1.166341780376868e-05, "loss": 0.1169, "step": 29500 }, { "epoch": 3.834957764782326, "grad_norm": 0.7289415597915649, "learning_rate": 1.165042235217674e-05, "loss": 0.1128, "step": 29510 }, { "epoch": 3.8362573099415203, "grad_norm": 0.5451872944831848, "learning_rate": 1.1637426900584795e-05, "loss": 0.0963, "step": 29520 }, { "epoch": 3.837556855100715, "grad_norm": 1.0582430362701416, "learning_rate": 1.1624431448992853e-05, "loss": 0.2126, "step": 29530 }, { "epoch": 3.838856400259909, "grad_norm": 0.21719320118427277, "learning_rate": 1.161143599740091e-05, "loss": 0.1021, "step": 29540 }, { "epoch": 3.840155945419103, "grad_norm": 0.4856785237789154, "learning_rate": 1.1598440545808966e-05, "loss": 0.1087, "step": 29550 }, { "epoch": 3.841455490578298, "grad_norm": 0.5485243201255798, "learning_rate": 1.1585445094217025e-05, "loss": 0.1079, "step": 29560 }, { "epoch": 3.842755035737492, "grad_norm": 0.39639315009117126, "learning_rate": 1.1572449642625081e-05, "loss": 0.1215, "step": 29570 }, { "epoch": 3.844054580896686, "grad_norm": 0.530387282371521, "learning_rate": 1.1559454191033139e-05, "loss": 0.1185, "step": 29580 }, { "epoch": 3.8453541260558803, "grad_norm": 0.25524458289146423, "learning_rate": 1.1546458739441196e-05, "loss": 0.1584, "step": 29590 }, { "epoch": 3.8466536712150745, "grad_norm": 0.6125794649124146, "learning_rate": 1.1533463287849252e-05, "loss": 0.0823, "step": 29600 }, { "epoch": 3.847953216374269, "grad_norm": 0.3210216164588928, "learning_rate": 1.1520467836257312e-05, "loss": 0.1073, "step": 29610 }, { "epoch": 3.8492527615334633, "grad_norm": 1.071460247039795, "learning_rate": 1.1507472384665367e-05, "loss": 0.0964, "step": 29620 }, { "epoch": 3.8505523066926575, "grad_norm": 0.5110921263694763, "learning_rate": 1.1494476933073425e-05, "loss": 0.1599, "step": 29630 }, { "epoch": 3.851851851851852, "grad_norm": 0.335906058549881, "learning_rate": 1.1481481481481482e-05, "loss": 0.0897, "step": 29640 }, { "epoch": 3.8531513970110463, "grad_norm": 0.9945403337478638, "learning_rate": 1.1468486029889538e-05, "loss": 0.1281, "step": 29650 }, { "epoch": 3.8544509421702404, "grad_norm": 0.6413795351982117, "learning_rate": 1.1455490578297596e-05, "loss": 0.1906, "step": 29660 }, { "epoch": 3.8557504873294346, "grad_norm": 0.2981964349746704, "learning_rate": 1.1442495126705653e-05, "loss": 0.1696, "step": 29670 }, { "epoch": 3.8570500324886288, "grad_norm": 0.40645691752433777, "learning_rate": 1.1429499675113711e-05, "loss": 0.1981, "step": 29680 }, { "epoch": 3.8583495776478234, "grad_norm": 0.21780608594417572, "learning_rate": 1.1416504223521769e-05, "loss": 0.1772, "step": 29690 }, { "epoch": 3.8596491228070176, "grad_norm": 1.6003963947296143, "learning_rate": 1.1403508771929824e-05, "loss": 0.1615, "step": 29700 }, { "epoch": 3.8609486679662117, "grad_norm": 0.8135634660720825, "learning_rate": 1.1390513320337882e-05, "loss": 0.1306, "step": 29710 }, { "epoch": 3.8622482131254063, "grad_norm": 0.7897058725357056, "learning_rate": 1.137751786874594e-05, "loss": 0.1092, "step": 29720 }, { "epoch": 3.8635477582846005, "grad_norm": 0.4340214431285858, "learning_rate": 1.1364522417153997e-05, "loss": 0.1113, "step": 29730 }, { "epoch": 3.8648473034437947, "grad_norm": 0.1921444833278656, "learning_rate": 1.1351526965562055e-05, "loss": 0.1025, "step": 29740 }, { "epoch": 3.866146848602989, "grad_norm": 0.8896617293357849, "learning_rate": 1.133853151397011e-05, "loss": 0.1426, "step": 29750 }, { "epoch": 3.867446393762183, "grad_norm": 0.22849641740322113, "learning_rate": 1.1325536062378168e-05, "loss": 0.1355, "step": 29760 }, { "epoch": 3.8687459389213776, "grad_norm": 0.7236873507499695, "learning_rate": 1.1312540610786226e-05, "loss": 0.1286, "step": 29770 }, { "epoch": 3.870045484080572, "grad_norm": 0.27736151218414307, "learning_rate": 1.1299545159194283e-05, "loss": 0.0787, "step": 29780 }, { "epoch": 3.871345029239766, "grad_norm": 0.48619523644447327, "learning_rate": 1.1286549707602339e-05, "loss": 0.1481, "step": 29790 }, { "epoch": 3.8726445743989606, "grad_norm": 0.5380812287330627, "learning_rate": 1.1273554256010397e-05, "loss": 0.1396, "step": 29800 }, { "epoch": 3.8739441195581548, "grad_norm": 0.8292491436004639, "learning_rate": 1.1260558804418454e-05, "loss": 0.1325, "step": 29810 }, { "epoch": 3.875243664717349, "grad_norm": 1.3401315212249756, "learning_rate": 1.1247563352826512e-05, "loss": 0.1786, "step": 29820 }, { "epoch": 3.876543209876543, "grad_norm": 0.7779733538627625, "learning_rate": 1.123456790123457e-05, "loss": 0.1025, "step": 29830 }, { "epoch": 3.8778427550357373, "grad_norm": 0.36202314496040344, "learning_rate": 1.1221572449642625e-05, "loss": 0.114, "step": 29840 }, { "epoch": 3.879142300194932, "grad_norm": 0.8478267788887024, "learning_rate": 1.1208576998050683e-05, "loss": 0.1399, "step": 29850 }, { "epoch": 3.880441845354126, "grad_norm": 0.3117624819278717, "learning_rate": 1.119558154645874e-05, "loss": 0.1316, "step": 29860 }, { "epoch": 3.8817413905133202, "grad_norm": 0.6152753829956055, "learning_rate": 1.1182586094866798e-05, "loss": 0.1424, "step": 29870 }, { "epoch": 3.883040935672515, "grad_norm": 0.5977213382720947, "learning_rate": 1.1169590643274855e-05, "loss": 0.1598, "step": 29880 }, { "epoch": 3.884340480831709, "grad_norm": 0.6750109195709229, "learning_rate": 1.1156595191682911e-05, "loss": 0.1439, "step": 29890 }, { "epoch": 3.885640025990903, "grad_norm": 0.36265555024147034, "learning_rate": 1.1143599740090969e-05, "loss": 0.1088, "step": 29900 }, { "epoch": 3.8869395711500974, "grad_norm": 0.5602633357048035, "learning_rate": 1.1130604288499026e-05, "loss": 0.0987, "step": 29910 }, { "epoch": 3.8882391163092915, "grad_norm": 0.41501086950302124, "learning_rate": 1.1117608836907082e-05, "loss": 0.1546, "step": 29920 }, { "epoch": 3.889538661468486, "grad_norm": 0.23508580029010773, "learning_rate": 1.1104613385315141e-05, "loss": 0.0953, "step": 29930 }, { "epoch": 3.8908382066276803, "grad_norm": 0.748666524887085, "learning_rate": 1.1091617933723197e-05, "loss": 0.1271, "step": 29940 }, { "epoch": 3.8921377517868745, "grad_norm": 1.1792446374893188, "learning_rate": 1.1078622482131255e-05, "loss": 0.1249, "step": 29950 }, { "epoch": 3.893437296946069, "grad_norm": 0.5780397057533264, "learning_rate": 1.1065627030539312e-05, "loss": 0.1022, "step": 29960 }, { "epoch": 3.8947368421052633, "grad_norm": 0.09983672946691513, "learning_rate": 1.1052631578947368e-05, "loss": 0.1624, "step": 29970 }, { "epoch": 3.8960363872644574, "grad_norm": 0.6993432641029358, "learning_rate": 1.1039636127355427e-05, "loss": 0.1288, "step": 29980 }, { "epoch": 3.8973359324236516, "grad_norm": 0.5556100606918335, "learning_rate": 1.1026640675763483e-05, "loss": 0.1198, "step": 29990 }, { "epoch": 3.898635477582846, "grad_norm": 0.6293817162513733, "learning_rate": 1.1013645224171539e-05, "loss": 0.1157, "step": 30000 }, { "epoch": 3.8999350227420404, "grad_norm": 0.7819668650627136, "learning_rate": 1.1000649772579598e-05, "loss": 0.1573, "step": 30010 }, { "epoch": 3.9012345679012346, "grad_norm": 0.21815909445285797, "learning_rate": 1.0987654320987654e-05, "loss": 0.1327, "step": 30020 }, { "epoch": 3.9025341130604287, "grad_norm": 0.23013059794902802, "learning_rate": 1.0974658869395712e-05, "loss": 0.1647, "step": 30030 }, { "epoch": 3.9038336582196234, "grad_norm": 0.3331412374973297, "learning_rate": 1.096166341780377e-05, "loss": 0.066, "step": 30040 }, { "epoch": 3.9051332033788175, "grad_norm": 0.9889025092124939, "learning_rate": 1.0948667966211825e-05, "loss": 0.1047, "step": 30050 }, { "epoch": 3.9064327485380117, "grad_norm": 0.5874791145324707, "learning_rate": 1.0935672514619884e-05, "loss": 0.1055, "step": 30060 }, { "epoch": 3.907732293697206, "grad_norm": 0.22828252613544464, "learning_rate": 1.092267706302794e-05, "loss": 0.1098, "step": 30070 }, { "epoch": 3.9090318388564, "grad_norm": 1.079086184501648, "learning_rate": 1.0909681611435998e-05, "loss": 0.1278, "step": 30080 }, { "epoch": 3.9103313840155947, "grad_norm": 0.25433775782585144, "learning_rate": 1.0896686159844055e-05, "loss": 0.1126, "step": 30090 }, { "epoch": 3.911630929174789, "grad_norm": 0.1706017255783081, "learning_rate": 1.0883690708252111e-05, "loss": 0.1018, "step": 30100 }, { "epoch": 3.912930474333983, "grad_norm": 1.0244684219360352, "learning_rate": 1.087069525666017e-05, "loss": 0.2172, "step": 30110 }, { "epoch": 3.9142300194931776, "grad_norm": 0.7407420873641968, "learning_rate": 1.0857699805068226e-05, "loss": 0.1066, "step": 30120 }, { "epoch": 3.915529564652372, "grad_norm": 0.20016346871852875, "learning_rate": 1.0844704353476284e-05, "loss": 0.1237, "step": 30130 }, { "epoch": 3.916829109811566, "grad_norm": 0.36914727091789246, "learning_rate": 1.0831708901884341e-05, "loss": 0.1222, "step": 30140 }, { "epoch": 3.91812865497076, "grad_norm": 0.668592631816864, "learning_rate": 1.0818713450292397e-05, "loss": 0.1301, "step": 30150 }, { "epoch": 3.9194282001299543, "grad_norm": 0.717862069606781, "learning_rate": 1.0805717998700455e-05, "loss": 0.0959, "step": 30160 }, { "epoch": 3.920727745289149, "grad_norm": 0.6306496858596802, "learning_rate": 1.0792722547108512e-05, "loss": 0.108, "step": 30170 }, { "epoch": 3.922027290448343, "grad_norm": 0.5356757640838623, "learning_rate": 1.077972709551657e-05, "loss": 0.1041, "step": 30180 }, { "epoch": 3.9233268356075373, "grad_norm": 0.43350717425346375, "learning_rate": 1.0766731643924627e-05, "loss": 0.141, "step": 30190 }, { "epoch": 3.924626380766732, "grad_norm": 0.606601357460022, "learning_rate": 1.0753736192332683e-05, "loss": 0.1252, "step": 30200 }, { "epoch": 3.925925925925926, "grad_norm": 0.32166752219200134, "learning_rate": 1.074074074074074e-05, "loss": 0.1148, "step": 30210 }, { "epoch": 3.92722547108512, "grad_norm": 0.5549533367156982, "learning_rate": 1.0727745289148798e-05, "loss": 0.0813, "step": 30220 }, { "epoch": 3.9285250162443144, "grad_norm": 0.24718992412090302, "learning_rate": 1.0714749837556856e-05, "loss": 0.1109, "step": 30230 }, { "epoch": 3.9298245614035086, "grad_norm": 1.4102083444595337, "learning_rate": 1.0701754385964913e-05, "loss": 0.1402, "step": 30240 }, { "epoch": 3.931124106562703, "grad_norm": 0.4978976845741272, "learning_rate": 1.068875893437297e-05, "loss": 0.0986, "step": 30250 }, { "epoch": 3.9324236517218973, "grad_norm": 0.2642197608947754, "learning_rate": 1.0675763482781027e-05, "loss": 0.1182, "step": 30260 }, { "epoch": 3.9337231968810915, "grad_norm": 0.17341965436935425, "learning_rate": 1.0662768031189084e-05, "loss": 0.1613, "step": 30270 }, { "epoch": 3.935022742040286, "grad_norm": 0.7318245768547058, "learning_rate": 1.0649772579597142e-05, "loss": 0.1077, "step": 30280 }, { "epoch": 3.9363222871994803, "grad_norm": 0.5227495431900024, "learning_rate": 1.0636777128005198e-05, "loss": 0.1035, "step": 30290 }, { "epoch": 3.9376218323586745, "grad_norm": 0.43333056569099426, "learning_rate": 1.0623781676413255e-05, "loss": 0.0592, "step": 30300 }, { "epoch": 3.9389213775178686, "grad_norm": 0.287557989358902, "learning_rate": 1.0610786224821313e-05, "loss": 0.1464, "step": 30310 }, { "epoch": 3.940220922677063, "grad_norm": 0.5653142929077148, "learning_rate": 1.059779077322937e-05, "loss": 0.1285, "step": 30320 }, { "epoch": 3.9415204678362574, "grad_norm": 0.5534887313842773, "learning_rate": 1.0584795321637428e-05, "loss": 0.1687, "step": 30330 }, { "epoch": 3.9428200129954516, "grad_norm": 0.17456598579883575, "learning_rate": 1.0571799870045484e-05, "loss": 0.1413, "step": 30340 }, { "epoch": 3.9441195581546458, "grad_norm": 0.14347036182880402, "learning_rate": 1.0558804418453541e-05, "loss": 0.1157, "step": 30350 }, { "epoch": 3.9454191033138404, "grad_norm": 0.2000730335712433, "learning_rate": 1.0545808966861599e-05, "loss": 0.1202, "step": 30360 }, { "epoch": 3.9467186484730346, "grad_norm": 0.3886091113090515, "learning_rate": 1.0532813515269657e-05, "loss": 0.1114, "step": 30370 }, { "epoch": 3.9480181936322287, "grad_norm": 0.3016844391822815, "learning_rate": 1.0519818063677714e-05, "loss": 0.1493, "step": 30380 }, { "epoch": 3.949317738791423, "grad_norm": 0.5959262847900391, "learning_rate": 1.050682261208577e-05, "loss": 0.1082, "step": 30390 }, { "epoch": 3.950617283950617, "grad_norm": 0.7112748622894287, "learning_rate": 1.0493827160493827e-05, "loss": 0.1052, "step": 30400 }, { "epoch": 3.9519168291098117, "grad_norm": 1.1604912281036377, "learning_rate": 1.0480831708901885e-05, "loss": 0.0973, "step": 30410 }, { "epoch": 3.953216374269006, "grad_norm": 0.683117687702179, "learning_rate": 1.0467836257309941e-05, "loss": 0.1421, "step": 30420 }, { "epoch": 3.9545159194282, "grad_norm": 0.39210283756256104, "learning_rate": 1.0454840805718e-05, "loss": 0.1197, "step": 30430 }, { "epoch": 3.9558154645873946, "grad_norm": 0.2984616160392761, "learning_rate": 1.0441845354126056e-05, "loss": 0.0832, "step": 30440 }, { "epoch": 3.957115009746589, "grad_norm": 0.594657838344574, "learning_rate": 1.0428849902534114e-05, "loss": 0.139, "step": 30450 }, { "epoch": 3.958414554905783, "grad_norm": 0.3789180815219879, "learning_rate": 1.0415854450942171e-05, "loss": 0.1395, "step": 30460 }, { "epoch": 3.959714100064977, "grad_norm": 0.7896630764007568, "learning_rate": 1.0402858999350227e-05, "loss": 0.1535, "step": 30470 }, { "epoch": 3.9610136452241713, "grad_norm": 0.8884479999542236, "learning_rate": 1.0389863547758286e-05, "loss": 0.1308, "step": 30480 }, { "epoch": 3.962313190383366, "grad_norm": 0.2980632185935974, "learning_rate": 1.0376868096166342e-05, "loss": 0.1244, "step": 30490 }, { "epoch": 3.96361273554256, "grad_norm": 0.5382214784622192, "learning_rate": 1.03638726445744e-05, "loss": 0.0885, "step": 30500 }, { "epoch": 3.9649122807017543, "grad_norm": 0.5435901880264282, "learning_rate": 1.0350877192982457e-05, "loss": 0.1524, "step": 30510 }, { "epoch": 3.966211825860949, "grad_norm": 0.32956910133361816, "learning_rate": 1.0337881741390513e-05, "loss": 0.1912, "step": 30520 }, { "epoch": 3.967511371020143, "grad_norm": 0.2340814471244812, "learning_rate": 1.0324886289798572e-05, "loss": 0.1176, "step": 30530 }, { "epoch": 3.9688109161793372, "grad_norm": 0.4151391088962555, "learning_rate": 1.0311890838206628e-05, "loss": 0.1111, "step": 30540 }, { "epoch": 3.9701104613385314, "grad_norm": 0.3388335704803467, "learning_rate": 1.0298895386614684e-05, "loss": 0.0849, "step": 30550 }, { "epoch": 3.9714100064977256, "grad_norm": 0.32390865683555603, "learning_rate": 1.0285899935022743e-05, "loss": 0.0778, "step": 30560 }, { "epoch": 3.97270955165692, "grad_norm": 0.500647783279419, "learning_rate": 1.0272904483430799e-05, "loss": 0.1396, "step": 30570 }, { "epoch": 3.9740090968161144, "grad_norm": 0.17914223670959473, "learning_rate": 1.0259909031838857e-05, "loss": 0.1701, "step": 30580 }, { "epoch": 3.9753086419753085, "grad_norm": 0.67775559425354, "learning_rate": 1.0246913580246914e-05, "loss": 0.154, "step": 30590 }, { "epoch": 3.976608187134503, "grad_norm": 0.48251473903656006, "learning_rate": 1.023391812865497e-05, "loss": 0.1728, "step": 30600 }, { "epoch": 3.9779077322936973, "grad_norm": 0.7939808964729309, "learning_rate": 1.022092267706303e-05, "loss": 0.1105, "step": 30610 }, { "epoch": 3.9792072774528915, "grad_norm": 0.17679911851882935, "learning_rate": 1.0207927225471085e-05, "loss": 0.1624, "step": 30620 }, { "epoch": 3.9805068226120857, "grad_norm": 0.33624035120010376, "learning_rate": 1.0194931773879143e-05, "loss": 0.1474, "step": 30630 }, { "epoch": 3.98180636777128, "grad_norm": 0.5538432002067566, "learning_rate": 1.01819363222872e-05, "loss": 0.1337, "step": 30640 }, { "epoch": 3.9831059129304744, "grad_norm": 0.3716858923435211, "learning_rate": 1.0168940870695256e-05, "loss": 0.1247, "step": 30650 }, { "epoch": 3.9844054580896686, "grad_norm": 0.30204734206199646, "learning_rate": 1.0155945419103315e-05, "loss": 0.0959, "step": 30660 }, { "epoch": 3.985705003248863, "grad_norm": 0.44603583216667175, "learning_rate": 1.0142949967511371e-05, "loss": 0.1312, "step": 30670 }, { "epoch": 3.9870045484080574, "grad_norm": 0.30316588282585144, "learning_rate": 1.0129954515919429e-05, "loss": 0.1386, "step": 30680 }, { "epoch": 3.9883040935672516, "grad_norm": 1.5458043813705444, "learning_rate": 1.0116959064327486e-05, "loss": 0.1945, "step": 30690 }, { "epoch": 3.9896036387264457, "grad_norm": 0.6774446368217468, "learning_rate": 1.0103963612735542e-05, "loss": 0.0995, "step": 30700 }, { "epoch": 3.99090318388564, "grad_norm": 0.6494578123092651, "learning_rate": 1.00909681611436e-05, "loss": 0.1018, "step": 30710 }, { "epoch": 3.992202729044834, "grad_norm": 0.2527182698249817, "learning_rate": 1.0077972709551657e-05, "loss": 0.1465, "step": 30720 }, { "epoch": 3.9935022742040287, "grad_norm": 0.41359126567840576, "learning_rate": 1.0064977257959715e-05, "loss": 0.1067, "step": 30730 }, { "epoch": 3.994801819363223, "grad_norm": 1.2030792236328125, "learning_rate": 1.0051981806367772e-05, "loss": 0.1791, "step": 30740 }, { "epoch": 3.996101364522417, "grad_norm": 0.4051867425441742, "learning_rate": 1.0038986354775828e-05, "loss": 0.0907, "step": 30750 }, { "epoch": 3.9974009096816117, "grad_norm": 0.5575466156005859, "learning_rate": 1.0025990903183886e-05, "loss": 0.1112, "step": 30760 }, { "epoch": 3.998700454840806, "grad_norm": 0.18799994885921478, "learning_rate": 1.0012995451591943e-05, "loss": 0.1, "step": 30770 }, { "epoch": 4.0, "grad_norm": 1.4077228307724, "learning_rate": 1e-05, "loss": 0.1692, "step": 30780 }, { "epoch": 4.0, "eval_loss": 0.08533798903226852, "eval_runtime": 854.8415, "eval_samples_per_second": 9.002, "eval_steps_per_second": 9.002, "step": 30780 }, { "epoch": 4.001299545159195, "grad_norm": 0.6066084504127502, "learning_rate": 9.987004548408058e-06, "loss": 0.121, "step": 30790 }, { "epoch": 4.002599090318388, "grad_norm": 0.4717155694961548, "learning_rate": 9.974009096816114e-06, "loss": 0.131, "step": 30800 }, { "epoch": 4.003898635477583, "grad_norm": 0.7376229166984558, "learning_rate": 9.961013645224172e-06, "loss": 0.131, "step": 30810 }, { "epoch": 4.005198180636777, "grad_norm": 0.27403855323791504, "learning_rate": 9.94801819363223e-06, "loss": 0.1385, "step": 30820 }, { "epoch": 4.006497725795971, "grad_norm": 0.9542315602302551, "learning_rate": 9.935022742040287e-06, "loss": 0.1252, "step": 30830 }, { "epoch": 4.007797270955166, "grad_norm": 0.8138768076896667, "learning_rate": 9.922027290448343e-06, "loss": 0.1161, "step": 30840 }, { "epoch": 4.00909681611436, "grad_norm": 0.3194228410720825, "learning_rate": 9.9090318388564e-06, "loss": 0.0909, "step": 30850 }, { "epoch": 4.010396361273554, "grad_norm": 0.49548065662384033, "learning_rate": 9.896036387264458e-06, "loss": 0.1172, "step": 30860 }, { "epoch": 4.011695906432749, "grad_norm": 0.18367226421833038, "learning_rate": 9.883040935672515e-06, "loss": 0.0571, "step": 30870 }, { "epoch": 4.012995451591943, "grad_norm": 0.5309764742851257, "learning_rate": 9.870045484080573e-06, "loss": 0.0941, "step": 30880 }, { "epoch": 4.014294996751137, "grad_norm": 0.31680649518966675, "learning_rate": 9.857050032488629e-06, "loss": 0.0866, "step": 30890 }, { "epoch": 4.015594541910332, "grad_norm": 0.6593419313430786, "learning_rate": 9.844054580896686e-06, "loss": 0.101, "step": 30900 }, { "epoch": 4.0168940870695256, "grad_norm": 0.1783370077610016, "learning_rate": 9.831059129304744e-06, "loss": 0.0964, "step": 30910 }, { "epoch": 4.01819363222872, "grad_norm": 0.45648646354675293, "learning_rate": 9.818063677712801e-06, "loss": 0.0942, "step": 30920 }, { "epoch": 4.019493177387914, "grad_norm": 0.8709624409675598, "learning_rate": 9.805068226120859e-06, "loss": 0.0807, "step": 30930 }, { "epoch": 4.0207927225471085, "grad_norm": 0.2428687959909439, "learning_rate": 9.792072774528915e-06, "loss": 0.1019, "step": 30940 }, { "epoch": 4.022092267706303, "grad_norm": 0.41604024171829224, "learning_rate": 9.779077322936972e-06, "loss": 0.0753, "step": 30950 }, { "epoch": 4.023391812865497, "grad_norm": 0.3540777564048767, "learning_rate": 9.76608187134503e-06, "loss": 0.0824, "step": 30960 }, { "epoch": 4.0246913580246915, "grad_norm": 0.32533028721809387, "learning_rate": 9.753086419753086e-06, "loss": 0.203, "step": 30970 }, { "epoch": 4.025990903183885, "grad_norm": 0.4163082242012024, "learning_rate": 9.740090968161145e-06, "loss": 0.0913, "step": 30980 }, { "epoch": 4.02729044834308, "grad_norm": 0.20237895846366882, "learning_rate": 9.727095516569201e-06, "loss": 0.1327, "step": 30990 }, { "epoch": 4.028589993502274, "grad_norm": 0.9059463739395142, "learning_rate": 9.714100064977258e-06, "loss": 0.1122, "step": 31000 }, { "epoch": 4.029889538661468, "grad_norm": 0.1753133237361908, "learning_rate": 9.701104613385316e-06, "loss": 0.0957, "step": 31010 }, { "epoch": 4.031189083820663, "grad_norm": 0.382424920797348, "learning_rate": 9.688109161793372e-06, "loss": 0.1106, "step": 31020 }, { "epoch": 4.032488628979857, "grad_norm": 0.5433745384216309, "learning_rate": 9.675113710201431e-06, "loss": 0.1048, "step": 31030 }, { "epoch": 4.033788174139051, "grad_norm": 0.3556884229183197, "learning_rate": 9.662118258609487e-06, "loss": 0.1071, "step": 31040 }, { "epoch": 4.035087719298246, "grad_norm": 0.7904723286628723, "learning_rate": 9.649122807017545e-06, "loss": 0.1217, "step": 31050 }, { "epoch": 4.03638726445744, "grad_norm": 0.7576868534088135, "learning_rate": 9.636127355425602e-06, "loss": 0.1258, "step": 31060 }, { "epoch": 4.037686809616634, "grad_norm": 0.5076293349266052, "learning_rate": 9.623131903833658e-06, "loss": 0.1395, "step": 31070 }, { "epoch": 4.038986354775829, "grad_norm": 0.5096575617790222, "learning_rate": 9.610136452241717e-06, "loss": 0.1099, "step": 31080 }, { "epoch": 4.040285899935022, "grad_norm": 0.9506875276565552, "learning_rate": 9.597141000649773e-06, "loss": 0.136, "step": 31090 }, { "epoch": 4.041585445094217, "grad_norm": 0.36133673787117004, "learning_rate": 9.584145549057829e-06, "loss": 0.0795, "step": 31100 }, { "epoch": 4.042884990253412, "grad_norm": 0.3461097776889801, "learning_rate": 9.571150097465888e-06, "loss": 0.0908, "step": 31110 }, { "epoch": 4.044184535412605, "grad_norm": 0.4224611818790436, "learning_rate": 9.558154645873944e-06, "loss": 0.1098, "step": 31120 }, { "epoch": 4.0454840805718, "grad_norm": 0.31589728593826294, "learning_rate": 9.545159194282002e-06, "loss": 0.0823, "step": 31130 }, { "epoch": 4.046783625730994, "grad_norm": 0.7337800860404968, "learning_rate": 9.532163742690059e-06, "loss": 0.115, "step": 31140 }, { "epoch": 4.048083170890188, "grad_norm": 0.17962293326854706, "learning_rate": 9.519168291098115e-06, "loss": 0.1094, "step": 31150 }, { "epoch": 4.049382716049383, "grad_norm": 0.3864992558956146, "learning_rate": 9.506172839506174e-06, "loss": 0.0751, "step": 31160 }, { "epoch": 4.050682261208577, "grad_norm": 0.6534704566001892, "learning_rate": 9.49317738791423e-06, "loss": 0.0902, "step": 31170 }, { "epoch": 4.051981806367771, "grad_norm": 0.6617427468299866, "learning_rate": 9.480181936322288e-06, "loss": 0.1495, "step": 31180 }, { "epoch": 4.053281351526966, "grad_norm": 0.5013090372085571, "learning_rate": 9.467186484730345e-06, "loss": 0.094, "step": 31190 }, { "epoch": 4.05458089668616, "grad_norm": 0.45164239406585693, "learning_rate": 9.454191033138401e-06, "loss": 0.1948, "step": 31200 }, { "epoch": 4.055880441845354, "grad_norm": 0.3660009503364563, "learning_rate": 9.44119558154646e-06, "loss": 0.1399, "step": 31210 }, { "epoch": 4.057179987004549, "grad_norm": 0.29993653297424316, "learning_rate": 9.428200129954516e-06, "loss": 0.0932, "step": 31220 }, { "epoch": 4.058479532163743, "grad_norm": 0.18882910907268524, "learning_rate": 9.415204678362574e-06, "loss": 0.1681, "step": 31230 }, { "epoch": 4.059779077322937, "grad_norm": 0.36240941286087036, "learning_rate": 9.402209226770631e-06, "loss": 0.1079, "step": 31240 }, { "epoch": 4.061078622482131, "grad_norm": 0.587300181388855, "learning_rate": 9.389213775178687e-06, "loss": 0.2205, "step": 31250 }, { "epoch": 4.0623781676413255, "grad_norm": 0.7184128761291504, "learning_rate": 9.376218323586745e-06, "loss": 0.1035, "step": 31260 }, { "epoch": 4.06367771280052, "grad_norm": 0.8275047540664673, "learning_rate": 9.363222871994802e-06, "loss": 0.0959, "step": 31270 }, { "epoch": 4.064977257959714, "grad_norm": 0.22384397685527802, "learning_rate": 9.35022742040286e-06, "loss": 0.121, "step": 31280 }, { "epoch": 4.0662768031189085, "grad_norm": 0.13948528468608856, "learning_rate": 9.337231968810917e-06, "loss": 0.0936, "step": 31290 }, { "epoch": 4.067576348278102, "grad_norm": 0.24806660413742065, "learning_rate": 9.324236517218973e-06, "loss": 0.1019, "step": 31300 }, { "epoch": 4.068875893437297, "grad_norm": 0.1992652714252472, "learning_rate": 9.31124106562703e-06, "loss": 0.0779, "step": 31310 }, { "epoch": 4.0701754385964914, "grad_norm": 0.3545239269733429, "learning_rate": 9.298245614035088e-06, "loss": 0.1047, "step": 31320 }, { "epoch": 4.071474983755685, "grad_norm": 0.6273838877677917, "learning_rate": 9.285250162443146e-06, "loss": 0.1064, "step": 31330 }, { "epoch": 4.07277452891488, "grad_norm": 0.18961508572101593, "learning_rate": 9.272254710851203e-06, "loss": 0.1048, "step": 31340 }, { "epoch": 4.074074074074074, "grad_norm": 0.6849515438079834, "learning_rate": 9.259259259259259e-06, "loss": 0.0952, "step": 31350 }, { "epoch": 4.075373619233268, "grad_norm": 0.532181441783905, "learning_rate": 9.246263807667317e-06, "loss": 0.0949, "step": 31360 }, { "epoch": 4.076673164392463, "grad_norm": 0.2903232276439667, "learning_rate": 9.233268356075374e-06, "loss": 0.0938, "step": 31370 }, { "epoch": 4.077972709551657, "grad_norm": 0.7329447865486145, "learning_rate": 9.220272904483432e-06, "loss": 0.0904, "step": 31380 }, { "epoch": 4.079272254710851, "grad_norm": 0.6112738251686096, "learning_rate": 9.207277452891488e-06, "loss": 0.1133, "step": 31390 }, { "epoch": 4.080571799870046, "grad_norm": 0.3732079565525055, "learning_rate": 9.194282001299545e-06, "loss": 0.0974, "step": 31400 }, { "epoch": 4.081871345029239, "grad_norm": 0.2330198585987091, "learning_rate": 9.181286549707603e-06, "loss": 0.1051, "step": 31410 }, { "epoch": 4.083170890188434, "grad_norm": 0.46181049942970276, "learning_rate": 9.16829109811566e-06, "loss": 0.1663, "step": 31420 }, { "epoch": 4.084470435347629, "grad_norm": 0.6631891131401062, "learning_rate": 9.155295646523718e-06, "loss": 0.0768, "step": 31430 }, { "epoch": 4.085769980506822, "grad_norm": 0.8520461916923523, "learning_rate": 9.142300194931774e-06, "loss": 0.1165, "step": 31440 }, { "epoch": 4.087069525666017, "grad_norm": 0.48602572083473206, "learning_rate": 9.129304743339831e-06, "loss": 0.1201, "step": 31450 }, { "epoch": 4.088369070825211, "grad_norm": 0.7490503191947937, "learning_rate": 9.116309291747889e-06, "loss": 0.0877, "step": 31460 }, { "epoch": 4.089668615984405, "grad_norm": 0.5194817185401917, "learning_rate": 9.103313840155946e-06, "loss": 0.1885, "step": 31470 }, { "epoch": 4.0909681611436, "grad_norm": 0.22110489010810852, "learning_rate": 9.090318388564004e-06, "loss": 0.0977, "step": 31480 }, { "epoch": 4.092267706302794, "grad_norm": 0.5808709859848022, "learning_rate": 9.07732293697206e-06, "loss": 0.1135, "step": 31490 }, { "epoch": 4.093567251461988, "grad_norm": 0.3081527054309845, "learning_rate": 9.064327485380117e-06, "loss": 0.1074, "step": 31500 }, { "epoch": 4.094866796621183, "grad_norm": 0.7410218119621277, "learning_rate": 9.051332033788175e-06, "loss": 0.1055, "step": 31510 }, { "epoch": 4.096166341780377, "grad_norm": 0.2173004299402237, "learning_rate": 9.03833658219623e-06, "loss": 0.1229, "step": 31520 }, { "epoch": 4.097465886939571, "grad_norm": 0.3673243820667267, "learning_rate": 9.02534113060429e-06, "loss": 0.1436, "step": 31530 }, { "epoch": 4.098765432098766, "grad_norm": 0.6413574814796448, "learning_rate": 9.012345679012346e-06, "loss": 0.0875, "step": 31540 }, { "epoch": 4.10006497725796, "grad_norm": 0.47068557143211365, "learning_rate": 8.999350227420403e-06, "loss": 0.097, "step": 31550 }, { "epoch": 4.101364522417154, "grad_norm": 0.8649875521659851, "learning_rate": 8.986354775828461e-06, "loss": 0.0802, "step": 31560 }, { "epoch": 4.102664067576348, "grad_norm": 0.11266093701124191, "learning_rate": 8.973359324236517e-06, "loss": 0.1026, "step": 31570 }, { "epoch": 4.1039636127355426, "grad_norm": 0.5673253536224365, "learning_rate": 8.960363872644576e-06, "loss": 0.1154, "step": 31580 }, { "epoch": 4.105263157894737, "grad_norm": 0.9379377365112305, "learning_rate": 8.947368421052632e-06, "loss": 0.1708, "step": 31590 }, { "epoch": 4.106562703053931, "grad_norm": 2.2903223037719727, "learning_rate": 8.93437296946069e-06, "loss": 0.1563, "step": 31600 }, { "epoch": 4.1078622482131255, "grad_norm": 1.3730247020721436, "learning_rate": 8.921377517868747e-06, "loss": 0.1541, "step": 31610 }, { "epoch": 4.109161793372319, "grad_norm": 0.39890724420547485, "learning_rate": 8.908382066276803e-06, "loss": 0.1214, "step": 31620 }, { "epoch": 4.110461338531514, "grad_norm": 0.5378705263137817, "learning_rate": 8.895386614684862e-06, "loss": 0.1212, "step": 31630 }, { "epoch": 4.1117608836907085, "grad_norm": 0.34709444642066956, "learning_rate": 8.882391163092918e-06, "loss": 0.083, "step": 31640 }, { "epoch": 4.113060428849902, "grad_norm": 0.25457337498664856, "learning_rate": 8.869395711500974e-06, "loss": 0.1276, "step": 31650 }, { "epoch": 4.114359974009097, "grad_norm": 0.552724301815033, "learning_rate": 8.856400259909033e-06, "loss": 0.1175, "step": 31660 }, { "epoch": 4.115659519168291, "grad_norm": 0.5782431960105896, "learning_rate": 8.843404808317089e-06, "loss": 0.1546, "step": 31670 }, { "epoch": 4.116959064327485, "grad_norm": 0.27863526344299316, "learning_rate": 8.830409356725146e-06, "loss": 0.1239, "step": 31680 }, { "epoch": 4.11825860948668, "grad_norm": 0.747110903263092, "learning_rate": 8.817413905133204e-06, "loss": 0.1218, "step": 31690 }, { "epoch": 4.119558154645874, "grad_norm": 0.31467151641845703, "learning_rate": 8.80441845354126e-06, "loss": 0.0928, "step": 31700 }, { "epoch": 4.120857699805068, "grad_norm": 1.0318827629089355, "learning_rate": 8.791423001949319e-06, "loss": 0.1038, "step": 31710 }, { "epoch": 4.122157244964263, "grad_norm": 0.5106990933418274, "learning_rate": 8.778427550357375e-06, "loss": 0.0896, "step": 31720 }, { "epoch": 4.1234567901234565, "grad_norm": 0.8817384243011475, "learning_rate": 8.765432098765432e-06, "loss": 0.1025, "step": 31730 }, { "epoch": 4.124756335282651, "grad_norm": 0.9949650168418884, "learning_rate": 8.75243664717349e-06, "loss": 0.1529, "step": 31740 }, { "epoch": 4.126055880441846, "grad_norm": 0.3410235345363617, "learning_rate": 8.739441195581546e-06, "loss": 0.1159, "step": 31750 }, { "epoch": 4.127355425601039, "grad_norm": 0.4217677712440491, "learning_rate": 8.726445743989605e-06, "loss": 0.0895, "step": 31760 }, { "epoch": 4.128654970760234, "grad_norm": 0.32272765040397644, "learning_rate": 8.713450292397661e-06, "loss": 0.1503, "step": 31770 }, { "epoch": 4.129954515919428, "grad_norm": 0.5218570828437805, "learning_rate": 8.700454840805719e-06, "loss": 0.101, "step": 31780 }, { "epoch": 4.131254061078622, "grad_norm": 0.17331381142139435, "learning_rate": 8.687459389213776e-06, "loss": 0.0639, "step": 31790 }, { "epoch": 4.132553606237817, "grad_norm": 0.246412456035614, "learning_rate": 8.674463937621832e-06, "loss": 0.0904, "step": 31800 }, { "epoch": 4.133853151397011, "grad_norm": 0.4997900426387787, "learning_rate": 8.66146848602989e-06, "loss": 0.1092, "step": 31810 }, { "epoch": 4.135152696556205, "grad_norm": 0.5617451667785645, "learning_rate": 8.648473034437947e-06, "loss": 0.1625, "step": 31820 }, { "epoch": 4.1364522417154, "grad_norm": 0.4024832844734192, "learning_rate": 8.635477582846005e-06, "loss": 0.1336, "step": 31830 }, { "epoch": 4.137751786874594, "grad_norm": 0.3087962865829468, "learning_rate": 8.622482131254062e-06, "loss": 0.1712, "step": 31840 }, { "epoch": 4.139051332033788, "grad_norm": 0.149591863155365, "learning_rate": 8.609486679662118e-06, "loss": 0.0685, "step": 31850 }, { "epoch": 4.140350877192983, "grad_norm": 0.1696351021528244, "learning_rate": 8.596491228070176e-06, "loss": 0.1769, "step": 31860 }, { "epoch": 4.141650422352177, "grad_norm": 0.3212931156158447, "learning_rate": 8.583495776478233e-06, "loss": 0.0651, "step": 31870 }, { "epoch": 4.142949967511371, "grad_norm": 0.8253023028373718, "learning_rate": 8.57050032488629e-06, "loss": 0.1314, "step": 31880 }, { "epoch": 4.144249512670565, "grad_norm": 0.21792691946029663, "learning_rate": 8.557504873294348e-06, "loss": 0.1491, "step": 31890 }, { "epoch": 4.14554905782976, "grad_norm": 0.5127707719802856, "learning_rate": 8.544509421702404e-06, "loss": 0.0816, "step": 31900 }, { "epoch": 4.146848602988954, "grad_norm": 0.466878741979599, "learning_rate": 8.531513970110462e-06, "loss": 0.0782, "step": 31910 }, { "epoch": 4.148148148148148, "grad_norm": 0.9106553792953491, "learning_rate": 8.518518518518519e-06, "loss": 0.1092, "step": 31920 }, { "epoch": 4.1494476933073425, "grad_norm": 0.07110325992107391, "learning_rate": 8.505523066926577e-06, "loss": 0.0782, "step": 31930 }, { "epoch": 4.150747238466536, "grad_norm": 0.5295692682266235, "learning_rate": 8.492527615334633e-06, "loss": 0.1441, "step": 31940 }, { "epoch": 4.152046783625731, "grad_norm": 0.39170539379119873, "learning_rate": 8.47953216374269e-06, "loss": 0.1492, "step": 31950 }, { "epoch": 4.1533463287849255, "grad_norm": 0.569768488407135, "learning_rate": 8.466536712150748e-06, "loss": 0.1075, "step": 31960 }, { "epoch": 4.154645873944119, "grad_norm": 0.30694857239723206, "learning_rate": 8.453541260558805e-06, "loss": 0.1308, "step": 31970 }, { "epoch": 4.155945419103314, "grad_norm": 0.15752464532852173, "learning_rate": 8.440545808966863e-06, "loss": 0.0729, "step": 31980 }, { "epoch": 4.1572449642625084, "grad_norm": 1.6636604070663452, "learning_rate": 8.427550357374919e-06, "loss": 0.1397, "step": 31990 }, { "epoch": 4.158544509421702, "grad_norm": 0.48259931802749634, "learning_rate": 8.414554905782976e-06, "loss": 0.1426, "step": 32000 }, { "epoch": 4.159844054580897, "grad_norm": 0.6735050678253174, "learning_rate": 8.401559454191034e-06, "loss": 0.0644, "step": 32010 }, { "epoch": 4.161143599740091, "grad_norm": 0.44808265566825867, "learning_rate": 8.388564002599091e-06, "loss": 0.1403, "step": 32020 }, { "epoch": 4.162443144899285, "grad_norm": 0.14214354753494263, "learning_rate": 8.375568551007149e-06, "loss": 0.0962, "step": 32030 }, { "epoch": 4.16374269005848, "grad_norm": 0.13382405042648315, "learning_rate": 8.362573099415205e-06, "loss": 0.0838, "step": 32040 }, { "epoch": 4.1650422352176735, "grad_norm": 0.7722732424736023, "learning_rate": 8.349577647823262e-06, "loss": 0.0752, "step": 32050 }, { "epoch": 4.166341780376868, "grad_norm": 0.29692700505256653, "learning_rate": 8.33658219623132e-06, "loss": 0.1186, "step": 32060 }, { "epoch": 4.167641325536063, "grad_norm": 0.8364206552505493, "learning_rate": 8.323586744639376e-06, "loss": 0.0907, "step": 32070 }, { "epoch": 4.168940870695256, "grad_norm": 0.19795498251914978, "learning_rate": 8.310591293047435e-06, "loss": 0.0766, "step": 32080 }, { "epoch": 4.170240415854451, "grad_norm": 0.1429315209388733, "learning_rate": 8.29759584145549e-06, "loss": 0.1076, "step": 32090 }, { "epoch": 4.171539961013645, "grad_norm": 0.5462930202484131, "learning_rate": 8.284600389863548e-06, "loss": 0.0937, "step": 32100 }, { "epoch": 4.172839506172839, "grad_norm": 0.49392515420913696, "learning_rate": 8.271604938271606e-06, "loss": 0.1236, "step": 32110 }, { "epoch": 4.174139051332034, "grad_norm": 0.12116701900959015, "learning_rate": 8.258609486679662e-06, "loss": 0.0917, "step": 32120 }, { "epoch": 4.175438596491228, "grad_norm": 0.21249671280384064, "learning_rate": 8.245614035087721e-06, "loss": 0.1005, "step": 32130 }, { "epoch": 4.176738141650422, "grad_norm": 0.5760244727134705, "learning_rate": 8.232618583495777e-06, "loss": 0.1241, "step": 32140 }, { "epoch": 4.178037686809617, "grad_norm": 1.343440294265747, "learning_rate": 8.219623131903834e-06, "loss": 0.1266, "step": 32150 }, { "epoch": 4.179337231968811, "grad_norm": 0.9204714298248291, "learning_rate": 8.206627680311892e-06, "loss": 0.1096, "step": 32160 }, { "epoch": 4.180636777128005, "grad_norm": 0.2963019907474518, "learning_rate": 8.193632228719948e-06, "loss": 0.1261, "step": 32170 }, { "epoch": 4.1819363222872, "grad_norm": 0.12872755527496338, "learning_rate": 8.180636777128007e-06, "loss": 0.0903, "step": 32180 }, { "epoch": 4.183235867446394, "grad_norm": 0.9757261872291565, "learning_rate": 8.167641325536063e-06, "loss": 0.085, "step": 32190 }, { "epoch": 4.184535412605588, "grad_norm": 0.3314783275127411, "learning_rate": 8.154645873944119e-06, "loss": 0.1075, "step": 32200 }, { "epoch": 4.185834957764782, "grad_norm": 0.12940701842308044, "learning_rate": 8.141650422352178e-06, "loss": 0.084, "step": 32210 }, { "epoch": 4.187134502923977, "grad_norm": 0.8537852764129639, "learning_rate": 8.128654970760234e-06, "loss": 0.1193, "step": 32220 }, { "epoch": 4.188434048083171, "grad_norm": 0.14699527621269226, "learning_rate": 8.115659519168291e-06, "loss": 0.0932, "step": 32230 }, { "epoch": 4.189733593242365, "grad_norm": 0.722428560256958, "learning_rate": 8.102664067576349e-06, "loss": 0.1387, "step": 32240 }, { "epoch": 4.1910331384015596, "grad_norm": 0.2506982982158661, "learning_rate": 8.089668615984405e-06, "loss": 0.087, "step": 32250 }, { "epoch": 4.192332683560753, "grad_norm": 0.8658591508865356, "learning_rate": 8.076673164392464e-06, "loss": 0.1109, "step": 32260 }, { "epoch": 4.193632228719948, "grad_norm": 0.3252827525138855, "learning_rate": 8.06367771280052e-06, "loss": 0.0889, "step": 32270 }, { "epoch": 4.1949317738791425, "grad_norm": 0.7993102073669434, "learning_rate": 8.050682261208577e-06, "loss": 0.1319, "step": 32280 }, { "epoch": 4.196231319038336, "grad_norm": 0.7462859153747559, "learning_rate": 8.037686809616635e-06, "loss": 0.0847, "step": 32290 }, { "epoch": 4.197530864197531, "grad_norm": 0.38859686255455017, "learning_rate": 8.02469135802469e-06, "loss": 0.0975, "step": 32300 }, { "epoch": 4.1988304093567255, "grad_norm": 0.37088650465011597, "learning_rate": 8.01169590643275e-06, "loss": 0.0995, "step": 32310 }, { "epoch": 4.200129954515919, "grad_norm": 0.6511968374252319, "learning_rate": 7.998700454840806e-06, "loss": 0.1096, "step": 32320 }, { "epoch": 4.201429499675114, "grad_norm": 0.5671115517616272, "learning_rate": 7.985705003248863e-06, "loss": 0.1018, "step": 32330 }, { "epoch": 4.202729044834308, "grad_norm": 0.8904199004173279, "learning_rate": 7.972709551656921e-06, "loss": 0.1022, "step": 32340 }, { "epoch": 4.204028589993502, "grad_norm": 0.1945429891347885, "learning_rate": 7.959714100064977e-06, "loss": 0.0943, "step": 32350 }, { "epoch": 4.205328135152697, "grad_norm": 0.38709965348243713, "learning_rate": 7.946718648473034e-06, "loss": 0.1187, "step": 32360 }, { "epoch": 4.2066276803118905, "grad_norm": 0.4061833918094635, "learning_rate": 7.933723196881092e-06, "loss": 0.065, "step": 32370 }, { "epoch": 4.207927225471085, "grad_norm": 0.18162167072296143, "learning_rate": 7.92072774528915e-06, "loss": 0.1241, "step": 32380 }, { "epoch": 4.20922677063028, "grad_norm": 0.2341485172510147, "learning_rate": 7.907732293697207e-06, "loss": 0.0808, "step": 32390 }, { "epoch": 4.2105263157894735, "grad_norm": 0.40126708149909973, "learning_rate": 7.894736842105263e-06, "loss": 0.0813, "step": 32400 }, { "epoch": 4.211825860948668, "grad_norm": 0.609703540802002, "learning_rate": 7.88174139051332e-06, "loss": 0.1211, "step": 32410 }, { "epoch": 4.213125406107862, "grad_norm": 0.1098548099398613, "learning_rate": 7.868745938921378e-06, "loss": 0.0944, "step": 32420 }, { "epoch": 4.214424951267056, "grad_norm": 1.0364534854888916, "learning_rate": 7.855750487329436e-06, "loss": 0.1399, "step": 32430 }, { "epoch": 4.215724496426251, "grad_norm": 0.788777768611908, "learning_rate": 7.842755035737493e-06, "loss": 0.101, "step": 32440 }, { "epoch": 4.217024041585445, "grad_norm": 0.9043503403663635, "learning_rate": 7.829759584145549e-06, "loss": 0.1784, "step": 32450 }, { "epoch": 4.218323586744639, "grad_norm": 0.3858771324157715, "learning_rate": 7.816764132553607e-06, "loss": 0.0992, "step": 32460 }, { "epoch": 4.219623131903834, "grad_norm": 0.5037456154823303, "learning_rate": 7.803768680961664e-06, "loss": 0.1334, "step": 32470 }, { "epoch": 4.220922677063028, "grad_norm": 0.6036518216133118, "learning_rate": 7.790773229369722e-06, "loss": 0.0737, "step": 32480 }, { "epoch": 4.222222222222222, "grad_norm": 0.5021937489509583, "learning_rate": 7.777777777777777e-06, "loss": 0.1253, "step": 32490 }, { "epoch": 4.223521767381417, "grad_norm": 0.5061480402946472, "learning_rate": 7.764782326185835e-06, "loss": 0.1356, "step": 32500 }, { "epoch": 4.224821312540611, "grad_norm": 0.9929001927375793, "learning_rate": 7.751786874593893e-06, "loss": 0.119, "step": 32510 }, { "epoch": 4.226120857699805, "grad_norm": 0.4470462501049042, "learning_rate": 7.73879142300195e-06, "loss": 0.0971, "step": 32520 }, { "epoch": 4.227420402858999, "grad_norm": 0.2854268252849579, "learning_rate": 7.725795971410008e-06, "loss": 0.1145, "step": 32530 }, { "epoch": 4.228719948018194, "grad_norm": 0.3426765203475952, "learning_rate": 7.712800519818064e-06, "loss": 0.0781, "step": 32540 }, { "epoch": 4.230019493177388, "grad_norm": 0.9819892644882202, "learning_rate": 7.699805068226121e-06, "loss": 0.1095, "step": 32550 }, { "epoch": 4.231319038336582, "grad_norm": 1.3273334503173828, "learning_rate": 7.686809616634179e-06, "loss": 0.1278, "step": 32560 }, { "epoch": 4.232618583495777, "grad_norm": 0.26352208852767944, "learning_rate": 7.673814165042236e-06, "loss": 0.17, "step": 32570 }, { "epoch": 4.23391812865497, "grad_norm": 0.24378056824207306, "learning_rate": 7.660818713450294e-06, "loss": 0.1334, "step": 32580 }, { "epoch": 4.235217673814165, "grad_norm": 0.4970357120037079, "learning_rate": 7.64782326185835e-06, "loss": 0.115, "step": 32590 }, { "epoch": 4.2365172189733595, "grad_norm": 0.4905957281589508, "learning_rate": 7.634827810266407e-06, "loss": 0.0904, "step": 32600 }, { "epoch": 4.237816764132553, "grad_norm": 0.504687488079071, "learning_rate": 7.621832358674465e-06, "loss": 0.0984, "step": 32610 }, { "epoch": 4.239116309291748, "grad_norm": 0.6907485127449036, "learning_rate": 7.608836907082521e-06, "loss": 0.1076, "step": 32620 }, { "epoch": 4.2404158544509425, "grad_norm": 0.5813502073287964, "learning_rate": 7.595841455490579e-06, "loss": 0.1409, "step": 32630 }, { "epoch": 4.241715399610136, "grad_norm": 0.3326462507247925, "learning_rate": 7.582846003898636e-06, "loss": 0.1195, "step": 32640 }, { "epoch": 4.243014944769331, "grad_norm": 0.46223050355911255, "learning_rate": 7.569850552306692e-06, "loss": 0.1125, "step": 32650 }, { "epoch": 4.2443144899285254, "grad_norm": 0.5034714937210083, "learning_rate": 7.556855100714751e-06, "loss": 0.0994, "step": 32660 }, { "epoch": 4.245614035087719, "grad_norm": 0.6087998747825623, "learning_rate": 7.5438596491228074e-06, "loss": 0.1142, "step": 32670 }, { "epoch": 4.246913580246914, "grad_norm": 0.5305776596069336, "learning_rate": 7.530864197530865e-06, "loss": 0.1196, "step": 32680 }, { "epoch": 4.2482131254061075, "grad_norm": 0.47971224784851074, "learning_rate": 7.517868745938922e-06, "loss": 0.1216, "step": 32690 }, { "epoch": 4.249512670565302, "grad_norm": 0.5040158033370972, "learning_rate": 7.504873294346978e-06, "loss": 0.0982, "step": 32700 }, { "epoch": 4.250812215724497, "grad_norm": 0.03191360458731651, "learning_rate": 7.491877842755037e-06, "loss": 0.0801, "step": 32710 }, { "epoch": 4.2521117608836905, "grad_norm": 0.39796820282936096, "learning_rate": 7.4788823911630935e-06, "loss": 0.0768, "step": 32720 }, { "epoch": 4.253411306042885, "grad_norm": 0.46028682589530945, "learning_rate": 7.465886939571149e-06, "loss": 0.1057, "step": 32730 }, { "epoch": 4.254710851202079, "grad_norm": 0.3478633165359497, "learning_rate": 7.452891487979208e-06, "loss": 0.1031, "step": 32740 }, { "epoch": 4.256010396361273, "grad_norm": 0.843146800994873, "learning_rate": 7.4398960363872645e-06, "loss": 0.1191, "step": 32750 }, { "epoch": 4.257309941520468, "grad_norm": 0.4183702766895294, "learning_rate": 7.426900584795322e-06, "loss": 0.1053, "step": 32760 }, { "epoch": 4.258609486679662, "grad_norm": 0.4412970542907715, "learning_rate": 7.413905133203379e-06, "loss": 0.112, "step": 32770 }, { "epoch": 4.259909031838856, "grad_norm": 0.4955657720565796, "learning_rate": 7.400909681611435e-06, "loss": 0.1129, "step": 32780 }, { "epoch": 4.261208576998051, "grad_norm": 0.5733329653739929, "learning_rate": 7.387914230019494e-06, "loss": 0.1171, "step": 32790 }, { "epoch": 4.262508122157245, "grad_norm": 0.771000862121582, "learning_rate": 7.3749187784275505e-06, "loss": 0.1191, "step": 32800 }, { "epoch": 4.263807667316439, "grad_norm": 0.5677163600921631, "learning_rate": 7.361923326835608e-06, "loss": 0.0989, "step": 32810 }, { "epoch": 4.265107212475634, "grad_norm": 0.5143436789512634, "learning_rate": 7.348927875243665e-06, "loss": 0.081, "step": 32820 }, { "epoch": 4.266406757634828, "grad_norm": 0.23280102014541626, "learning_rate": 7.3359324236517215e-06, "loss": 0.0639, "step": 32830 }, { "epoch": 4.267706302794022, "grad_norm": 0.3691701889038086, "learning_rate": 7.32293697205978e-06, "loss": 0.0661, "step": 32840 }, { "epoch": 4.269005847953216, "grad_norm": 0.6580929756164551, "learning_rate": 7.3099415204678366e-06, "loss": 0.0818, "step": 32850 }, { "epoch": 4.270305393112411, "grad_norm": 0.3176690340042114, "learning_rate": 7.296946068875894e-06, "loss": 0.1132, "step": 32860 }, { "epoch": 4.271604938271605, "grad_norm": 0.7559297680854797, "learning_rate": 7.283950617283951e-06, "loss": 0.1068, "step": 32870 }, { "epoch": 4.272904483430799, "grad_norm": 0.3837917447090149, "learning_rate": 7.2709551656920075e-06, "loss": 0.0797, "step": 32880 }, { "epoch": 4.274204028589994, "grad_norm": 0.5778807401657104, "learning_rate": 7.257959714100065e-06, "loss": 0.0887, "step": 32890 }, { "epoch": 4.275503573749187, "grad_norm": 0.7370020151138306, "learning_rate": 7.244964262508122e-06, "loss": 0.0994, "step": 32900 }, { "epoch": 4.276803118908382, "grad_norm": 0.3712182343006134, "learning_rate": 7.23196881091618e-06, "loss": 0.1195, "step": 32910 }, { "epoch": 4.2781026640675766, "grad_norm": 0.410961389541626, "learning_rate": 7.218973359324237e-06, "loss": 0.1114, "step": 32920 }, { "epoch": 4.27940220922677, "grad_norm": 0.6179782748222351, "learning_rate": 7.2059779077322936e-06, "loss": 0.2039, "step": 32930 }, { "epoch": 4.280701754385965, "grad_norm": 0.4457748532295227, "learning_rate": 7.192982456140351e-06, "loss": 0.0959, "step": 32940 }, { "epoch": 4.2820012995451595, "grad_norm": 1.1463780403137207, "learning_rate": 7.179987004548408e-06, "loss": 0.1105, "step": 32950 }, { "epoch": 4.283300844704353, "grad_norm": 0.27236199378967285, "learning_rate": 7.166991552956466e-06, "loss": 0.1169, "step": 32960 }, { "epoch": 4.284600389863548, "grad_norm": 0.875171422958374, "learning_rate": 7.153996101364523e-06, "loss": 0.1656, "step": 32970 }, { "epoch": 4.2858999350227425, "grad_norm": 0.28401756286621094, "learning_rate": 7.14100064977258e-06, "loss": 0.1636, "step": 32980 }, { "epoch": 4.287199480181936, "grad_norm": 0.37999427318573, "learning_rate": 7.128005198180637e-06, "loss": 0.0793, "step": 32990 }, { "epoch": 4.288499025341131, "grad_norm": 1.097833514213562, "learning_rate": 7.115009746588694e-06, "loss": 0.1416, "step": 33000 }, { "epoch": 4.2897985705003245, "grad_norm": 0.2042083442211151, "learning_rate": 7.102014294996752e-06, "loss": 0.1143, "step": 33010 }, { "epoch": 4.291098115659519, "grad_norm": 1.8743904829025269, "learning_rate": 7.089018843404808e-06, "loss": 0.139, "step": 33020 }, { "epoch": 4.292397660818714, "grad_norm": 0.5303312540054321, "learning_rate": 7.076023391812865e-06, "loss": 0.1405, "step": 33030 }, { "epoch": 4.2936972059779075, "grad_norm": 0.29273921251296997, "learning_rate": 7.063027940220923e-06, "loss": 0.0893, "step": 33040 }, { "epoch": 4.294996751137102, "grad_norm": 0.6490885615348816, "learning_rate": 7.05003248862898e-06, "loss": 0.1256, "step": 33050 }, { "epoch": 4.296296296296296, "grad_norm": 0.1426430493593216, "learning_rate": 7.0370370370370375e-06, "loss": 0.0985, "step": 33060 }, { "epoch": 4.2975958414554905, "grad_norm": 0.5293628573417664, "learning_rate": 7.024041585445094e-06, "loss": 0.0836, "step": 33070 }, { "epoch": 4.298895386614685, "grad_norm": 0.40378981828689575, "learning_rate": 7.011046133853151e-06, "loss": 0.1152, "step": 33080 }, { "epoch": 4.300194931773879, "grad_norm": 0.5234165191650391, "learning_rate": 6.998050682261209e-06, "loss": 0.1148, "step": 33090 }, { "epoch": 4.301494476933073, "grad_norm": 0.8735635876655579, "learning_rate": 6.985055230669266e-06, "loss": 0.1191, "step": 33100 }, { "epoch": 4.302794022092268, "grad_norm": 0.17460979521274567, "learning_rate": 6.9720597790773235e-06, "loss": 0.0582, "step": 33110 }, { "epoch": 4.304093567251462, "grad_norm": 0.6215935349464417, "learning_rate": 6.95906432748538e-06, "loss": 0.0714, "step": 33120 }, { "epoch": 4.305393112410656, "grad_norm": 0.521101713180542, "learning_rate": 6.946068875893437e-06, "loss": 0.0918, "step": 33130 }, { "epoch": 4.306692657569851, "grad_norm": 0.2987120747566223, "learning_rate": 6.933073424301495e-06, "loss": 0.0759, "step": 33140 }, { "epoch": 4.307992202729045, "grad_norm": 0.9470245242118835, "learning_rate": 6.920077972709551e-06, "loss": 0.1567, "step": 33150 }, { "epoch": 4.309291747888239, "grad_norm": 0.4432067275047302, "learning_rate": 6.90708252111761e-06, "loss": 0.1026, "step": 33160 }, { "epoch": 4.310591293047433, "grad_norm": 0.5241031646728516, "learning_rate": 6.894087069525666e-06, "loss": 0.1291, "step": 33170 }, { "epoch": 4.311890838206628, "grad_norm": 0.5091931223869324, "learning_rate": 6.881091617933723e-06, "loss": 0.1346, "step": 33180 }, { "epoch": 4.313190383365822, "grad_norm": 0.7903463244438171, "learning_rate": 6.8680961663417806e-06, "loss": 0.1135, "step": 33190 }, { "epoch": 4.314489928525016, "grad_norm": 1.0706839561462402, "learning_rate": 6.855100714749837e-06, "loss": 0.1331, "step": 33200 }, { "epoch": 4.315789473684211, "grad_norm": 0.4563952386379242, "learning_rate": 6.842105263157896e-06, "loss": 0.0771, "step": 33210 }, { "epoch": 4.317089018843404, "grad_norm": 0.22194743156433105, "learning_rate": 6.829109811565952e-06, "loss": 0.0623, "step": 33220 }, { "epoch": 4.318388564002599, "grad_norm": 0.5865038633346558, "learning_rate": 6.816114359974009e-06, "loss": 0.0745, "step": 33230 }, { "epoch": 4.319688109161794, "grad_norm": 0.841828465461731, "learning_rate": 6.803118908382067e-06, "loss": 0.0973, "step": 33240 }, { "epoch": 4.320987654320987, "grad_norm": 0.4883880019187927, "learning_rate": 6.790123456790123e-06, "loss": 0.1096, "step": 33250 }, { "epoch": 4.322287199480182, "grad_norm": 0.46802064776420593, "learning_rate": 6.777128005198182e-06, "loss": 0.1081, "step": 33260 }, { "epoch": 4.3235867446393765, "grad_norm": 0.6837434768676758, "learning_rate": 6.764132553606238e-06, "loss": 0.0875, "step": 33270 }, { "epoch": 4.32488628979857, "grad_norm": 0.6929888129234314, "learning_rate": 6.751137102014294e-06, "loss": 0.0744, "step": 33280 }, { "epoch": 4.326185834957765, "grad_norm": 0.2573325037956238, "learning_rate": 6.738141650422353e-06, "loss": 0.0655, "step": 33290 }, { "epoch": 4.3274853801169595, "grad_norm": 1.0822219848632812, "learning_rate": 6.725146198830409e-06, "loss": 0.1569, "step": 33300 }, { "epoch": 4.328784925276153, "grad_norm": 1.5567187070846558, "learning_rate": 6.712150747238467e-06, "loss": 0.1665, "step": 33310 }, { "epoch": 4.330084470435348, "grad_norm": 0.10259564220905304, "learning_rate": 6.699155295646524e-06, "loss": 0.1147, "step": 33320 }, { "epoch": 4.331384015594542, "grad_norm": 0.3965739607810974, "learning_rate": 6.68615984405458e-06, "loss": 0.0964, "step": 33330 }, { "epoch": 4.332683560753736, "grad_norm": 0.3874206244945526, "learning_rate": 6.673164392462639e-06, "loss": 0.1265, "step": 33340 }, { "epoch": 4.333983105912931, "grad_norm": 0.6644031405448914, "learning_rate": 6.660168940870695e-06, "loss": 0.1046, "step": 33350 }, { "epoch": 4.3352826510721245, "grad_norm": 0.5959255695343018, "learning_rate": 6.647173489278753e-06, "loss": 0.0828, "step": 33360 }, { "epoch": 4.336582196231319, "grad_norm": 0.27525755763053894, "learning_rate": 6.63417803768681e-06, "loss": 0.1505, "step": 33370 }, { "epoch": 4.337881741390513, "grad_norm": 0.6128037571907043, "learning_rate": 6.621182586094866e-06, "loss": 0.0694, "step": 33380 }, { "epoch": 4.3391812865497075, "grad_norm": 0.774000883102417, "learning_rate": 6.608187134502925e-06, "loss": 0.1533, "step": 33390 }, { "epoch": 4.340480831708902, "grad_norm": 1.2889561653137207, "learning_rate": 6.5951916829109815e-06, "loss": 0.1402, "step": 33400 }, { "epoch": 4.341780376868096, "grad_norm": 0.29048630595207214, "learning_rate": 6.582196231319039e-06, "loss": 0.1243, "step": 33410 }, { "epoch": 4.34307992202729, "grad_norm": 0.33861273527145386, "learning_rate": 6.569200779727096e-06, "loss": 0.1409, "step": 33420 }, { "epoch": 4.344379467186485, "grad_norm": 0.47413840889930725, "learning_rate": 6.5562053281351524e-06, "loss": 0.0898, "step": 33430 }, { "epoch": 4.345679012345679, "grad_norm": 0.5792078971862793, "learning_rate": 6.54320987654321e-06, "loss": 0.1048, "step": 33440 }, { "epoch": 4.346978557504873, "grad_norm": 0.5003282427787781, "learning_rate": 6.530214424951267e-06, "loss": 0.149, "step": 33450 }, { "epoch": 4.348278102664068, "grad_norm": 0.28187450766563416, "learning_rate": 6.517218973359325e-06, "loss": 0.1397, "step": 33460 }, { "epoch": 4.349577647823262, "grad_norm": 0.4976404905319214, "learning_rate": 6.504223521767382e-06, "loss": 0.1134, "step": 33470 }, { "epoch": 4.350877192982456, "grad_norm": 0.8406165242195129, "learning_rate": 6.4912280701754385e-06, "loss": 0.1136, "step": 33480 }, { "epoch": 4.35217673814165, "grad_norm": 0.8033232688903809, "learning_rate": 6.478232618583496e-06, "loss": 0.083, "step": 33490 }, { "epoch": 4.353476283300845, "grad_norm": 1.7708494663238525, "learning_rate": 6.465237166991553e-06, "loss": 0.1808, "step": 33500 }, { "epoch": 4.354775828460039, "grad_norm": 0.8263327479362488, "learning_rate": 6.452241715399611e-06, "loss": 0.1112, "step": 33510 }, { "epoch": 4.356075373619233, "grad_norm": 0.43531233072280884, "learning_rate": 6.439246263807668e-06, "loss": 0.1164, "step": 33520 }, { "epoch": 4.357374918778428, "grad_norm": 0.42265012860298157, "learning_rate": 6.4262508122157245e-06, "loss": 0.1162, "step": 33530 }, { "epoch": 4.358674463937621, "grad_norm": 0.46213141083717346, "learning_rate": 6.413255360623782e-06, "loss": 0.1367, "step": 33540 }, { "epoch": 4.359974009096816, "grad_norm": 0.3359011113643646, "learning_rate": 6.400259909031839e-06, "loss": 0.1119, "step": 33550 }, { "epoch": 4.361273554256011, "grad_norm": 0.6322929859161377, "learning_rate": 6.387264457439897e-06, "loss": 0.1251, "step": 33560 }, { "epoch": 4.362573099415204, "grad_norm": 0.1565367430448532, "learning_rate": 6.374269005847953e-06, "loss": 0.1126, "step": 33570 }, { "epoch": 4.363872644574399, "grad_norm": 0.31050562858581543, "learning_rate": 6.36127355425601e-06, "loss": 0.085, "step": 33580 }, { "epoch": 4.3651721897335936, "grad_norm": 0.20204909145832062, "learning_rate": 6.348278102664068e-06, "loss": 0.1291, "step": 33590 }, { "epoch": 4.366471734892787, "grad_norm": 0.4057925343513489, "learning_rate": 6.335282651072125e-06, "loss": 0.1165, "step": 33600 }, { "epoch": 4.367771280051982, "grad_norm": 0.5840856432914734, "learning_rate": 6.322287199480182e-06, "loss": 0.1045, "step": 33610 }, { "epoch": 4.3690708252111765, "grad_norm": 0.8040991425514221, "learning_rate": 6.309291747888239e-06, "loss": 0.095, "step": 33620 }, { "epoch": 4.37037037037037, "grad_norm": 0.23970192670822144, "learning_rate": 6.296296296296296e-06, "loss": 0.1022, "step": 33630 }, { "epoch": 4.371669915529565, "grad_norm": 0.44096243381500244, "learning_rate": 6.283300844704354e-06, "loss": 0.1338, "step": 33640 }, { "epoch": 4.372969460688759, "grad_norm": 0.3946719765663147, "learning_rate": 6.270305393112411e-06, "loss": 0.1294, "step": 33650 }, { "epoch": 4.374269005847953, "grad_norm": 0.306661993265152, "learning_rate": 6.2573099415204685e-06, "loss": 0.0982, "step": 33660 }, { "epoch": 4.375568551007148, "grad_norm": 0.424548864364624, "learning_rate": 6.244314489928525e-06, "loss": 0.1398, "step": 33670 }, { "epoch": 4.3768680961663415, "grad_norm": 1.470564365386963, "learning_rate": 6.231319038336583e-06, "loss": 0.1161, "step": 33680 }, { "epoch": 4.378167641325536, "grad_norm": 0.22352568805217743, "learning_rate": 6.21832358674464e-06, "loss": 0.0861, "step": 33690 }, { "epoch": 4.37946718648473, "grad_norm": 0.5305229425430298, "learning_rate": 6.205328135152696e-06, "loss": 0.0689, "step": 33700 }, { "epoch": 4.3807667316439245, "grad_norm": 0.2171868234872818, "learning_rate": 6.192332683560754e-06, "loss": 0.087, "step": 33710 }, { "epoch": 4.382066276803119, "grad_norm": 0.2647916376590729, "learning_rate": 6.179337231968811e-06, "loss": 0.0776, "step": 33720 }, { "epoch": 4.383365821962313, "grad_norm": 0.48610761761665344, "learning_rate": 6.166341780376869e-06, "loss": 0.0985, "step": 33730 }, { "epoch": 4.3846653671215075, "grad_norm": 0.37340712547302246, "learning_rate": 6.1533463287849255e-06, "loss": 0.07, "step": 33740 }, { "epoch": 4.385964912280702, "grad_norm": 0.358388751745224, "learning_rate": 6.140350877192982e-06, "loss": 0.1287, "step": 33750 }, { "epoch": 4.387264457439896, "grad_norm": 0.507301926612854, "learning_rate": 6.12735542560104e-06, "loss": 0.096, "step": 33760 }, { "epoch": 4.38856400259909, "grad_norm": 0.6117141246795654, "learning_rate": 6.114359974009097e-06, "loss": 0.0968, "step": 33770 }, { "epoch": 4.389863547758285, "grad_norm": 0.71375972032547, "learning_rate": 6.101364522417154e-06, "loss": 0.0955, "step": 33780 }, { "epoch": 4.391163092917479, "grad_norm": 0.32994967699050903, "learning_rate": 6.0883690708252115e-06, "loss": 0.1125, "step": 33790 }, { "epoch": 4.392462638076673, "grad_norm": 0.9629419445991516, "learning_rate": 6.075373619233268e-06, "loss": 0.1067, "step": 33800 }, { "epoch": 4.393762183235867, "grad_norm": 0.8368945717811584, "learning_rate": 6.062378167641326e-06, "loss": 0.1743, "step": 33810 }, { "epoch": 4.395061728395062, "grad_norm": 0.1351398080587387, "learning_rate": 6.049382716049383e-06, "loss": 0.1465, "step": 33820 }, { "epoch": 4.396361273554256, "grad_norm": 1.2944058179855347, "learning_rate": 6.03638726445744e-06, "loss": 0.115, "step": 33830 }, { "epoch": 4.39766081871345, "grad_norm": 0.33239835500717163, "learning_rate": 6.023391812865498e-06, "loss": 0.1059, "step": 33840 }, { "epoch": 4.398960363872645, "grad_norm": 0.17253567278385162, "learning_rate": 6.010396361273554e-06, "loss": 0.1033, "step": 33850 }, { "epoch": 4.400259909031838, "grad_norm": 0.283682256937027, "learning_rate": 5.997400909681612e-06, "loss": 0.1121, "step": 33860 }, { "epoch": 4.401559454191033, "grad_norm": 0.24708080291748047, "learning_rate": 5.9844054580896685e-06, "loss": 0.112, "step": 33870 }, { "epoch": 4.402858999350228, "grad_norm": 0.41875019669532776, "learning_rate": 5.971410006497726e-06, "loss": 0.1047, "step": 33880 }, { "epoch": 4.404158544509421, "grad_norm": 0.5460824966430664, "learning_rate": 5.958414554905784e-06, "loss": 0.0792, "step": 33890 }, { "epoch": 4.405458089668616, "grad_norm": 0.941615879535675, "learning_rate": 5.94541910331384e-06, "loss": 0.0979, "step": 33900 }, { "epoch": 4.406757634827811, "grad_norm": 0.7224177122116089, "learning_rate": 5.932423651721897e-06, "loss": 0.1533, "step": 33910 }, { "epoch": 4.408057179987004, "grad_norm": 0.708080530166626, "learning_rate": 5.919428200129955e-06, "loss": 0.1198, "step": 33920 }, { "epoch": 4.409356725146199, "grad_norm": 0.45185387134552, "learning_rate": 5.906432748538012e-06, "loss": 0.1002, "step": 33930 }, { "epoch": 4.4106562703053935, "grad_norm": 0.7072089314460754, "learning_rate": 5.89343729694607e-06, "loss": 0.117, "step": 33940 }, { "epoch": 4.411955815464587, "grad_norm": 0.3460264801979065, "learning_rate": 5.880441845354126e-06, "loss": 0.1065, "step": 33950 }, { "epoch": 4.413255360623782, "grad_norm": 0.6136173605918884, "learning_rate": 5.867446393762183e-06, "loss": 0.0944, "step": 33960 }, { "epoch": 4.414554905782976, "grad_norm": 0.6327337026596069, "learning_rate": 5.854450942170241e-06, "loss": 0.0819, "step": 33970 }, { "epoch": 4.41585445094217, "grad_norm": 0.21780377626419067, "learning_rate": 5.841455490578298e-06, "loss": 0.0897, "step": 33980 }, { "epoch": 4.417153996101365, "grad_norm": 0.1420472264289856, "learning_rate": 5.828460038986355e-06, "loss": 0.0751, "step": 33990 }, { "epoch": 4.418453541260559, "grad_norm": 0.5648816823959351, "learning_rate": 5.815464587394412e-06, "loss": 0.1659, "step": 34000 }, { "epoch": 4.419753086419753, "grad_norm": 0.16575367748737335, "learning_rate": 5.802469135802469e-06, "loss": 0.1247, "step": 34010 }, { "epoch": 4.421052631578947, "grad_norm": 0.2328636199235916, "learning_rate": 5.789473684210527e-06, "loss": 0.1044, "step": 34020 }, { "epoch": 4.4223521767381415, "grad_norm": 0.6331861019134521, "learning_rate": 5.776478232618584e-06, "loss": 0.1115, "step": 34030 }, { "epoch": 4.423651721897336, "grad_norm": 0.1451825499534607, "learning_rate": 5.763482781026641e-06, "loss": 0.0814, "step": 34040 }, { "epoch": 4.42495126705653, "grad_norm": 0.16200406849384308, "learning_rate": 5.750487329434698e-06, "loss": 0.0905, "step": 34050 }, { "epoch": 4.4262508122157245, "grad_norm": 1.2885607481002808, "learning_rate": 5.737491877842755e-06, "loss": 0.1328, "step": 34060 }, { "epoch": 4.427550357374919, "grad_norm": 0.49686315655708313, "learning_rate": 5.724496426250813e-06, "loss": 0.0915, "step": 34070 }, { "epoch": 4.428849902534113, "grad_norm": 0.2661912143230438, "learning_rate": 5.7115009746588695e-06, "loss": 0.0953, "step": 34080 }, { "epoch": 4.430149447693307, "grad_norm": 0.25194811820983887, "learning_rate": 5.698505523066927e-06, "loss": 0.1061, "step": 34090 }, { "epoch": 4.431448992852502, "grad_norm": 0.6118634939193726, "learning_rate": 5.685510071474984e-06, "loss": 0.1127, "step": 34100 }, { "epoch": 4.432748538011696, "grad_norm": 1.2644850015640259, "learning_rate": 5.672514619883041e-06, "loss": 0.1379, "step": 34110 }, { "epoch": 4.43404808317089, "grad_norm": 0.4293118715286255, "learning_rate": 5.659519168291098e-06, "loss": 0.1242, "step": 34120 }, { "epoch": 4.435347628330084, "grad_norm": 0.6070493459701538, "learning_rate": 5.6465237166991555e-06, "loss": 0.091, "step": 34130 }, { "epoch": 4.436647173489279, "grad_norm": 0.3285188674926758, "learning_rate": 5.633528265107213e-06, "loss": 0.1133, "step": 34140 }, { "epoch": 4.437946718648473, "grad_norm": 0.8914385437965393, "learning_rate": 5.62053281351527e-06, "loss": 0.1342, "step": 34150 }, { "epoch": 4.439246263807667, "grad_norm": 0.7026621103286743, "learning_rate": 5.607537361923327e-06, "loss": 0.1234, "step": 34160 }, { "epoch": 4.440545808966862, "grad_norm": 0.5703632235527039, "learning_rate": 5.594541910331384e-06, "loss": 0.0985, "step": 34170 }, { "epoch": 4.441845354126055, "grad_norm": 0.46732375025749207, "learning_rate": 5.5815464587394416e-06, "loss": 0.1003, "step": 34180 }, { "epoch": 4.44314489928525, "grad_norm": 0.364181786775589, "learning_rate": 5.568551007147499e-06, "loss": 0.0782, "step": 34190 }, { "epoch": 4.444444444444445, "grad_norm": 0.3083246052265167, "learning_rate": 5.555555555555556e-06, "loss": 0.0856, "step": 34200 }, { "epoch": 4.445743989603638, "grad_norm": 0.2298300713300705, "learning_rate": 5.5425601039636125e-06, "loss": 0.1196, "step": 34210 }, { "epoch": 4.447043534762833, "grad_norm": 0.5567384362220764, "learning_rate": 5.52956465237167e-06, "loss": 0.1072, "step": 34220 }, { "epoch": 4.448343079922028, "grad_norm": 0.7244008779525757, "learning_rate": 5.516569200779728e-06, "loss": 0.0927, "step": 34230 }, { "epoch": 4.449642625081221, "grad_norm": 0.4753226339817047, "learning_rate": 5.503573749187785e-06, "loss": 0.1811, "step": 34240 }, { "epoch": 4.450942170240416, "grad_norm": 0.4359007179737091, "learning_rate": 5.490578297595841e-06, "loss": 0.0673, "step": 34250 }, { "epoch": 4.4522417153996106, "grad_norm": 0.5568147897720337, "learning_rate": 5.477582846003899e-06, "loss": 0.127, "step": 34260 }, { "epoch": 4.453541260558804, "grad_norm": 0.13184954226016998, "learning_rate": 5.464587394411956e-06, "loss": 0.0944, "step": 34270 }, { "epoch": 4.454840805717999, "grad_norm": 0.17453378438949585, "learning_rate": 5.451591942820014e-06, "loss": 0.0733, "step": 34280 }, { "epoch": 4.456140350877193, "grad_norm": 0.5418881177902222, "learning_rate": 5.43859649122807e-06, "loss": 0.0927, "step": 34290 }, { "epoch": 4.457439896036387, "grad_norm": 0.22708873450756073, "learning_rate": 5.425601039636127e-06, "loss": 0.1767, "step": 34300 }, { "epoch": 4.458739441195582, "grad_norm": 0.47889193892478943, "learning_rate": 5.412605588044185e-06, "loss": 0.1118, "step": 34310 }, { "epoch": 4.460038986354776, "grad_norm": 1.4482580423355103, "learning_rate": 5.399610136452242e-06, "loss": 0.1048, "step": 34320 }, { "epoch": 4.46133853151397, "grad_norm": 0.6409318447113037, "learning_rate": 5.386614684860299e-06, "loss": 0.1332, "step": 34330 }, { "epoch": 4.462638076673164, "grad_norm": 0.8259595036506653, "learning_rate": 5.3736192332683564e-06, "loss": 0.1386, "step": 34340 }, { "epoch": 4.4639376218323585, "grad_norm": 0.5787337422370911, "learning_rate": 5.360623781676413e-06, "loss": 0.1462, "step": 34350 }, { "epoch": 4.465237166991553, "grad_norm": 0.7283847332000732, "learning_rate": 5.347628330084471e-06, "loss": 0.1285, "step": 34360 }, { "epoch": 4.466536712150747, "grad_norm": 0.5736411213874817, "learning_rate": 5.334632878492528e-06, "loss": 0.0856, "step": 34370 }, { "epoch": 4.4678362573099415, "grad_norm": 0.5143736600875854, "learning_rate": 5.321637426900585e-06, "loss": 0.1106, "step": 34380 }, { "epoch": 4.469135802469136, "grad_norm": 0.3588794767856598, "learning_rate": 5.3086419753086425e-06, "loss": 0.1153, "step": 34390 }, { "epoch": 4.47043534762833, "grad_norm": 0.3291316032409668, "learning_rate": 5.295646523716699e-06, "loss": 0.1163, "step": 34400 }, { "epoch": 4.4717348927875245, "grad_norm": 0.35122719407081604, "learning_rate": 5.282651072124757e-06, "loss": 0.1372, "step": 34410 }, { "epoch": 4.473034437946719, "grad_norm": 0.4236571192741394, "learning_rate": 5.2696556205328134e-06, "loss": 0.0961, "step": 34420 }, { "epoch": 4.474333983105913, "grad_norm": 0.6878372430801392, "learning_rate": 5.256660168940871e-06, "loss": 0.1034, "step": 34430 }, { "epoch": 4.475633528265107, "grad_norm": 0.4691615402698517, "learning_rate": 5.2436647173489285e-06, "loss": 0.1298, "step": 34440 }, { "epoch": 4.476933073424301, "grad_norm": 0.5674958229064941, "learning_rate": 5.230669265756985e-06, "loss": 0.1585, "step": 34450 }, { "epoch": 4.478232618583496, "grad_norm": 0.984259307384491, "learning_rate": 5.217673814165042e-06, "loss": 0.1432, "step": 34460 }, { "epoch": 4.47953216374269, "grad_norm": 0.334913969039917, "learning_rate": 5.2046783625730995e-06, "loss": 0.1439, "step": 34470 }, { "epoch": 4.480831708901884, "grad_norm": 0.5485069155693054, "learning_rate": 5.191682910981157e-06, "loss": 0.1121, "step": 34480 }, { "epoch": 4.482131254061079, "grad_norm": 0.3013074994087219, "learning_rate": 5.178687459389215e-06, "loss": 0.1026, "step": 34490 }, { "epoch": 4.483430799220272, "grad_norm": 0.15371070802211761, "learning_rate": 5.165692007797271e-06, "loss": 0.0796, "step": 34500 }, { "epoch": 4.484730344379467, "grad_norm": 0.6319029331207275, "learning_rate": 5.152696556205328e-06, "loss": 0.097, "step": 34510 }, { "epoch": 4.486029889538662, "grad_norm": 0.8656356334686279, "learning_rate": 5.1397011046133856e-06, "loss": 0.1207, "step": 34520 }, { "epoch": 4.487329434697855, "grad_norm": 0.6274484395980835, "learning_rate": 5.126705653021443e-06, "loss": 0.0949, "step": 34530 }, { "epoch": 4.48862897985705, "grad_norm": 0.696329653263092, "learning_rate": 5.1137102014295e-06, "loss": 0.1062, "step": 34540 }, { "epoch": 4.489928525016245, "grad_norm": 0.2333296239376068, "learning_rate": 5.1007147498375565e-06, "loss": 0.0755, "step": 34550 }, { "epoch": 4.491228070175438, "grad_norm": 0.5507047176361084, "learning_rate": 5.087719298245614e-06, "loss": 0.0973, "step": 34560 }, { "epoch": 4.492527615334633, "grad_norm": 0.4811822175979614, "learning_rate": 5.074723846653672e-06, "loss": 0.073, "step": 34570 }, { "epoch": 4.493827160493828, "grad_norm": 0.541387677192688, "learning_rate": 5.061728395061729e-06, "loss": 0.0686, "step": 34580 }, { "epoch": 4.495126705653021, "grad_norm": 1.2971529960632324, "learning_rate": 5.048732943469786e-06, "loss": 0.1764, "step": 34590 }, { "epoch": 4.496426250812216, "grad_norm": 0.4759012460708618, "learning_rate": 5.0357374918778426e-06, "loss": 0.0769, "step": 34600 }, { "epoch": 4.49772579597141, "grad_norm": 0.15824592113494873, "learning_rate": 5.0227420402859e-06, "loss": 0.1061, "step": 34610 }, { "epoch": 4.499025341130604, "grad_norm": 0.2644828259944916, "learning_rate": 5.009746588693958e-06, "loss": 0.101, "step": 34620 }, { "epoch": 4.500324886289799, "grad_norm": 0.189304381608963, "learning_rate": 4.996751137102014e-06, "loss": 0.0925, "step": 34630 }, { "epoch": 4.501624431448993, "grad_norm": 0.672249972820282, "learning_rate": 4.983755685510072e-06, "loss": 0.0978, "step": 34640 }, { "epoch": 4.502923976608187, "grad_norm": 0.9049472808837891, "learning_rate": 4.970760233918129e-06, "loss": 0.1226, "step": 34650 }, { "epoch": 4.504223521767381, "grad_norm": 0.3415215313434601, "learning_rate": 4.957764782326186e-06, "loss": 0.0876, "step": 34660 }, { "epoch": 4.505523066926576, "grad_norm": 0.32486459612846375, "learning_rate": 4.944769330734243e-06, "loss": 0.0718, "step": 34670 }, { "epoch": 4.50682261208577, "grad_norm": 0.20070666074752808, "learning_rate": 4.9317738791423004e-06, "loss": 0.088, "step": 34680 }, { "epoch": 4.508122157244964, "grad_norm": 0.6485567092895508, "learning_rate": 4.918778427550358e-06, "loss": 0.1166, "step": 34690 }, { "epoch": 4.5094217024041585, "grad_norm": 0.20083223283290863, "learning_rate": 4.905782975958415e-06, "loss": 0.1106, "step": 34700 }, { "epoch": 4.510721247563353, "grad_norm": 0.31681886315345764, "learning_rate": 4.892787524366472e-06, "loss": 0.0984, "step": 34710 }, { "epoch": 4.512020792722547, "grad_norm": 0.3880396783351898, "learning_rate": 4.879792072774529e-06, "loss": 0.1356, "step": 34720 }, { "epoch": 4.5133203378817415, "grad_norm": 0.13048504292964935, "learning_rate": 4.8667966211825865e-06, "loss": 0.1003, "step": 34730 }, { "epoch": 4.514619883040936, "grad_norm": 1.4645823240280151, "learning_rate": 4.853801169590644e-06, "loss": 0.1268, "step": 34740 }, { "epoch": 4.51591942820013, "grad_norm": 1.374050498008728, "learning_rate": 4.840805717998701e-06, "loss": 0.1109, "step": 34750 }, { "epoch": 4.517218973359324, "grad_norm": 1.0649964809417725, "learning_rate": 4.8278102664067574e-06, "loss": 0.1134, "step": 34760 }, { "epoch": 4.518518518518518, "grad_norm": 0.21920569241046906, "learning_rate": 4.814814814814815e-06, "loss": 0.167, "step": 34770 }, { "epoch": 4.519818063677713, "grad_norm": 0.45014166831970215, "learning_rate": 4.8018193632228725e-06, "loss": 0.0853, "step": 34780 }, { "epoch": 4.521117608836907, "grad_norm": 0.4282344877719879, "learning_rate": 4.78882391163093e-06, "loss": 0.119, "step": 34790 }, { "epoch": 4.522417153996101, "grad_norm": 0.39977720379829407, "learning_rate": 4.775828460038986e-06, "loss": 0.1184, "step": 34800 }, { "epoch": 4.523716699155296, "grad_norm": 0.1966175138950348, "learning_rate": 4.7628330084470435e-06, "loss": 0.102, "step": 34810 }, { "epoch": 4.5250162443144895, "grad_norm": 0.4748269021511078, "learning_rate": 4.749837556855101e-06, "loss": 0.1049, "step": 34820 }, { "epoch": 4.526315789473684, "grad_norm": 0.9046814441680908, "learning_rate": 4.736842105263159e-06, "loss": 0.1425, "step": 34830 }, { "epoch": 4.527615334632879, "grad_norm": 0.7178598046302795, "learning_rate": 4.723846653671215e-06, "loss": 0.1319, "step": 34840 }, { "epoch": 4.528914879792072, "grad_norm": 0.3367522358894348, "learning_rate": 4.710851202079272e-06, "loss": 0.1313, "step": 34850 }, { "epoch": 4.530214424951267, "grad_norm": 0.2832222282886505, "learning_rate": 4.6978557504873295e-06, "loss": 0.1057, "step": 34860 }, { "epoch": 4.531513970110462, "grad_norm": 0.6979503631591797, "learning_rate": 4.684860298895387e-06, "loss": 0.1695, "step": 34870 }, { "epoch": 4.532813515269655, "grad_norm": 0.6356257796287537, "learning_rate": 4.671864847303444e-06, "loss": 0.1284, "step": 34880 }, { "epoch": 4.53411306042885, "grad_norm": 1.3451128005981445, "learning_rate": 4.658869395711501e-06, "loss": 0.1609, "step": 34890 }, { "epoch": 4.535412605588045, "grad_norm": 0.887553870677948, "learning_rate": 4.645873944119558e-06, "loss": 0.1467, "step": 34900 }, { "epoch": 4.536712150747238, "grad_norm": 0.3518974781036377, "learning_rate": 4.632878492527616e-06, "loss": 0.0677, "step": 34910 }, { "epoch": 4.538011695906433, "grad_norm": 0.5294730067253113, "learning_rate": 4.619883040935673e-06, "loss": 0.0959, "step": 34920 }, { "epoch": 4.539311241065627, "grad_norm": 0.8345827460289001, "learning_rate": 4.60688758934373e-06, "loss": 0.1426, "step": 34930 }, { "epoch": 4.540610786224821, "grad_norm": 0.6401714086532593, "learning_rate": 4.593892137751787e-06, "loss": 0.1169, "step": 34940 }, { "epoch": 4.541910331384016, "grad_norm": 0.4239555597305298, "learning_rate": 4.580896686159844e-06, "loss": 0.1107, "step": 34950 }, { "epoch": 4.54320987654321, "grad_norm": 0.44285956025123596, "learning_rate": 4.567901234567902e-06, "loss": 0.1026, "step": 34960 }, { "epoch": 4.544509421702404, "grad_norm": 0.336091548204422, "learning_rate": 4.554905782975958e-06, "loss": 0.1105, "step": 34970 }, { "epoch": 4.545808966861598, "grad_norm": 0.4385092258453369, "learning_rate": 4.541910331384016e-06, "loss": 0.1519, "step": 34980 }, { "epoch": 4.547108512020793, "grad_norm": 0.2882421314716339, "learning_rate": 4.5289148797920735e-06, "loss": 0.1023, "step": 34990 }, { "epoch": 4.548408057179987, "grad_norm": 0.6224398612976074, "learning_rate": 4.51591942820013e-06, "loss": 0.1355, "step": 35000 }, { "epoch": 4.549707602339181, "grad_norm": 0.4545753002166748, "learning_rate": 4.502923976608187e-06, "loss": 0.0923, "step": 35010 }, { "epoch": 4.5510071474983755, "grad_norm": 0.3940757215023041, "learning_rate": 4.489928525016244e-06, "loss": 0.0953, "step": 35020 }, { "epoch": 4.55230669265757, "grad_norm": 0.8101934790611267, "learning_rate": 4.476933073424302e-06, "loss": 0.1613, "step": 35030 }, { "epoch": 4.553606237816764, "grad_norm": 0.8187233209609985, "learning_rate": 4.4639376218323595e-06, "loss": 0.1024, "step": 35040 }, { "epoch": 4.5549057829759585, "grad_norm": 0.14314158260822296, "learning_rate": 4.450942170240416e-06, "loss": 0.0813, "step": 35050 }, { "epoch": 4.556205328135153, "grad_norm": 0.4444246292114258, "learning_rate": 4.437946718648473e-06, "loss": 0.0901, "step": 35060 }, { "epoch": 4.557504873294347, "grad_norm": 0.5073850750923157, "learning_rate": 4.4249512670565305e-06, "loss": 0.0887, "step": 35070 }, { "epoch": 4.5588044184535415, "grad_norm": 1.1236085891723633, "learning_rate": 4.411955815464588e-06, "loss": 0.1222, "step": 35080 }, { "epoch": 4.560103963612735, "grad_norm": 0.6439655423164368, "learning_rate": 4.398960363872645e-06, "loss": 0.1588, "step": 35090 }, { "epoch": 4.56140350877193, "grad_norm": 0.6108437776565552, "learning_rate": 4.3859649122807014e-06, "loss": 0.1175, "step": 35100 }, { "epoch": 4.562703053931124, "grad_norm": 0.5070381164550781, "learning_rate": 4.372969460688759e-06, "loss": 0.1121, "step": 35110 }, { "epoch": 4.564002599090318, "grad_norm": 0.16895264387130737, "learning_rate": 4.3599740090968165e-06, "loss": 0.1245, "step": 35120 }, { "epoch": 4.565302144249513, "grad_norm": 0.9799762964248657, "learning_rate": 4.346978557504874e-06, "loss": 0.1291, "step": 35130 }, { "epoch": 4.5666016894087065, "grad_norm": 0.2959105968475342, "learning_rate": 4.333983105912931e-06, "loss": 0.1658, "step": 35140 }, { "epoch": 4.567901234567901, "grad_norm": 0.4628155529499054, "learning_rate": 4.3209876543209875e-06, "loss": 0.1351, "step": 35150 }, { "epoch": 4.569200779727096, "grad_norm": 0.5577920079231262, "learning_rate": 4.307992202729045e-06, "loss": 0.1064, "step": 35160 }, { "epoch": 4.570500324886289, "grad_norm": 0.8662489652633667, "learning_rate": 4.294996751137103e-06, "loss": 0.1435, "step": 35170 }, { "epoch": 4.571799870045484, "grad_norm": 0.2639419138431549, "learning_rate": 4.282001299545159e-06, "loss": 0.0777, "step": 35180 }, { "epoch": 4.573099415204679, "grad_norm": 1.2441622018814087, "learning_rate": 4.269005847953217e-06, "loss": 0.1214, "step": 35190 }, { "epoch": 4.574398960363872, "grad_norm": 0.31692951917648315, "learning_rate": 4.2560103963612735e-06, "loss": 0.0959, "step": 35200 }, { "epoch": 4.575698505523067, "grad_norm": 1.4765530824661255, "learning_rate": 4.243014944769331e-06, "loss": 0.1064, "step": 35210 }, { "epoch": 4.576998050682262, "grad_norm": 1.076117753982544, "learning_rate": 4.230019493177388e-06, "loss": 0.0727, "step": 35220 }, { "epoch": 4.578297595841455, "grad_norm": 0.9193745851516724, "learning_rate": 4.217024041585445e-06, "loss": 0.2117, "step": 35230 }, { "epoch": 4.57959714100065, "grad_norm": 1.0617538690567017, "learning_rate": 4.204028589993503e-06, "loss": 0.1773, "step": 35240 }, { "epoch": 4.580896686159844, "grad_norm": 0.8919404149055481, "learning_rate": 4.19103313840156e-06, "loss": 0.0935, "step": 35250 }, { "epoch": 4.582196231319038, "grad_norm": 0.10998523980379105, "learning_rate": 4.178037686809617e-06, "loss": 0.1372, "step": 35260 }, { "epoch": 4.583495776478233, "grad_norm": 0.23820380866527557, "learning_rate": 4.165042235217674e-06, "loss": 0.0949, "step": 35270 }, { "epoch": 4.584795321637427, "grad_norm": 0.4322490394115448, "learning_rate": 4.152046783625731e-06, "loss": 0.1762, "step": 35280 }, { "epoch": 4.586094866796621, "grad_norm": 0.4565349817276001, "learning_rate": 4.139051332033789e-06, "loss": 0.0944, "step": 35290 }, { "epoch": 4.587394411955815, "grad_norm": 0.4930018484592438, "learning_rate": 4.126055880441846e-06, "loss": 0.0876, "step": 35300 }, { "epoch": 4.58869395711501, "grad_norm": 1.057830810546875, "learning_rate": 4.113060428849902e-06, "loss": 0.0921, "step": 35310 }, { "epoch": 4.589993502274204, "grad_norm": 0.3306872844696045, "learning_rate": 4.10006497725796e-06, "loss": 0.1027, "step": 35320 }, { "epoch": 4.591293047433398, "grad_norm": 0.20814943313598633, "learning_rate": 4.0870695256660175e-06, "loss": 0.1233, "step": 35330 }, { "epoch": 4.592592592592593, "grad_norm": 0.6244149804115295, "learning_rate": 4.074074074074075e-06, "loss": 0.127, "step": 35340 }, { "epoch": 4.593892137751787, "grad_norm": 0.9854623675346375, "learning_rate": 4.061078622482131e-06, "loss": 0.1522, "step": 35350 }, { "epoch": 4.595191682910981, "grad_norm": 0.4683781862258911, "learning_rate": 4.048083170890188e-06, "loss": 0.1282, "step": 35360 }, { "epoch": 4.5964912280701755, "grad_norm": 0.412689208984375, "learning_rate": 4.035087719298246e-06, "loss": 0.0889, "step": 35370 }, { "epoch": 4.59779077322937, "grad_norm": 0.6633956432342529, "learning_rate": 4.0220922677063035e-06, "loss": 0.162, "step": 35380 }, { "epoch": 4.599090318388564, "grad_norm": 0.63885098695755, "learning_rate": 4.00909681611436e-06, "loss": 0.1021, "step": 35390 }, { "epoch": 4.6003898635477585, "grad_norm": 0.466661274433136, "learning_rate": 3.996101364522417e-06, "loss": 0.1022, "step": 35400 }, { "epoch": 4.601689408706952, "grad_norm": 0.37001675367355347, "learning_rate": 3.9831059129304745e-06, "loss": 0.1043, "step": 35410 }, { "epoch": 4.602988953866147, "grad_norm": 1.1947927474975586, "learning_rate": 3.970110461338532e-06, "loss": 0.1434, "step": 35420 }, { "epoch": 4.604288499025341, "grad_norm": 0.6335619688034058, "learning_rate": 3.957115009746589e-06, "loss": 0.1142, "step": 35430 }, { "epoch": 4.605588044184535, "grad_norm": 0.6841056942939758, "learning_rate": 3.944119558154646e-06, "loss": 0.084, "step": 35440 }, { "epoch": 4.60688758934373, "grad_norm": 0.4636789858341217, "learning_rate": 3.931124106562703e-06, "loss": 0.1012, "step": 35450 }, { "epoch": 4.6081871345029235, "grad_norm": 0.4273281991481781, "learning_rate": 3.9181286549707605e-06, "loss": 0.0825, "step": 35460 }, { "epoch": 4.609486679662118, "grad_norm": 0.6958406567573547, "learning_rate": 3.905133203378818e-06, "loss": 0.1584, "step": 35470 }, { "epoch": 4.610786224821313, "grad_norm": 0.12275560200214386, "learning_rate": 3.892137751786875e-06, "loss": 0.1217, "step": 35480 }, { "epoch": 4.6120857699805065, "grad_norm": 0.5583785176277161, "learning_rate": 3.879142300194932e-06, "loss": 0.1204, "step": 35490 }, { "epoch": 4.613385315139701, "grad_norm": 0.9693744778633118, "learning_rate": 3.866146848602989e-06, "loss": 0.1478, "step": 35500 }, { "epoch": 4.614684860298896, "grad_norm": 0.5763810276985168, "learning_rate": 3.8531513970110466e-06, "loss": 0.1456, "step": 35510 }, { "epoch": 4.615984405458089, "grad_norm": 0.17880038917064667, "learning_rate": 3.840155945419103e-06, "loss": 0.1237, "step": 35520 }, { "epoch": 4.617283950617284, "grad_norm": 0.4436105787754059, "learning_rate": 3.827160493827161e-06, "loss": 0.0652, "step": 35530 }, { "epoch": 4.618583495776479, "grad_norm": 0.3410549759864807, "learning_rate": 3.8141650422352184e-06, "loss": 0.086, "step": 35540 }, { "epoch": 4.619883040935672, "grad_norm": 0.27334606647491455, "learning_rate": 3.8011695906432747e-06, "loss": 0.087, "step": 35550 }, { "epoch": 4.621182586094867, "grad_norm": 0.345345675945282, "learning_rate": 3.788174139051332e-06, "loss": 0.0913, "step": 35560 }, { "epoch": 4.622482131254061, "grad_norm": 0.44409501552581787, "learning_rate": 3.7751786874593893e-06, "loss": 0.1442, "step": 35570 }, { "epoch": 4.623781676413255, "grad_norm": 1.2246148586273193, "learning_rate": 3.762183235867447e-06, "loss": 0.1203, "step": 35580 }, { "epoch": 4.62508122157245, "grad_norm": 0.4496554136276245, "learning_rate": 3.7491877842755036e-06, "loss": 0.127, "step": 35590 }, { "epoch": 4.626380766731644, "grad_norm": 0.06828044354915619, "learning_rate": 3.7361923326835607e-06, "loss": 0.0544, "step": 35600 }, { "epoch": 4.627680311890838, "grad_norm": 0.6323972940444946, "learning_rate": 3.723196881091618e-06, "loss": 0.1045, "step": 35610 }, { "epoch": 4.628979857050032, "grad_norm": 1.267647624015808, "learning_rate": 3.7102014294996754e-06, "loss": 0.0886, "step": 35620 }, { "epoch": 4.630279402209227, "grad_norm": 0.7860323190689087, "learning_rate": 3.6972059779077325e-06, "loss": 0.1111, "step": 35630 }, { "epoch": 4.631578947368421, "grad_norm": 0.2707422375679016, "learning_rate": 3.6842105263157892e-06, "loss": 0.1327, "step": 35640 }, { "epoch": 4.632878492527615, "grad_norm": 0.22505614161491394, "learning_rate": 3.6712150747238468e-06, "loss": 0.1132, "step": 35650 }, { "epoch": 4.63417803768681, "grad_norm": 0.4877300560474396, "learning_rate": 3.658219623131904e-06, "loss": 0.1015, "step": 35660 }, { "epoch": 4.635477582846004, "grad_norm": 0.30340373516082764, "learning_rate": 3.6452241715399614e-06, "loss": 0.1146, "step": 35670 }, { "epoch": 4.636777128005198, "grad_norm": 0.4295879900455475, "learning_rate": 3.6322287199480186e-06, "loss": 0.1194, "step": 35680 }, { "epoch": 4.6380766731643925, "grad_norm": 0.20829448103904724, "learning_rate": 3.6192332683560753e-06, "loss": 0.0857, "step": 35690 }, { "epoch": 4.639376218323587, "grad_norm": 0.5847800970077515, "learning_rate": 3.6062378167641324e-06, "loss": 0.064, "step": 35700 }, { "epoch": 4.640675763482781, "grad_norm": 0.3051070272922516, "learning_rate": 3.59324236517219e-06, "loss": 0.0975, "step": 35710 }, { "epoch": 4.6419753086419755, "grad_norm": 0.7554360032081604, "learning_rate": 3.580246913580247e-06, "loss": 0.0845, "step": 35720 }, { "epoch": 4.643274853801169, "grad_norm": 0.336065411567688, "learning_rate": 3.5672514619883046e-06, "loss": 0.1289, "step": 35730 }, { "epoch": 4.644574398960364, "grad_norm": 1.0440149307250977, "learning_rate": 3.554256010396361e-06, "loss": 0.121, "step": 35740 }, { "epoch": 4.6458739441195585, "grad_norm": 0.5044328570365906, "learning_rate": 3.5412605588044185e-06, "loss": 0.114, "step": 35750 }, { "epoch": 4.647173489278752, "grad_norm": 0.5253631472587585, "learning_rate": 3.5282651072124756e-06, "loss": 0.0946, "step": 35760 }, { "epoch": 4.648473034437947, "grad_norm": 0.34100621938705444, "learning_rate": 3.515269655620533e-06, "loss": 0.1458, "step": 35770 }, { "epoch": 4.6497725795971405, "grad_norm": 0.41082102060317993, "learning_rate": 3.5022742040285903e-06, "loss": 0.1252, "step": 35780 }, { "epoch": 4.651072124756335, "grad_norm": 0.6146283149719238, "learning_rate": 3.489278752436647e-06, "loss": 0.158, "step": 35790 }, { "epoch": 4.65237166991553, "grad_norm": 0.1624896228313446, "learning_rate": 3.4762833008447045e-06, "loss": 0.1378, "step": 35800 }, { "epoch": 4.6536712150747235, "grad_norm": 0.8832399845123291, "learning_rate": 3.4632878492527616e-06, "loss": 0.1136, "step": 35810 }, { "epoch": 4.654970760233918, "grad_norm": 0.7435377240180969, "learning_rate": 3.4502923976608188e-06, "loss": 0.1833, "step": 35820 }, { "epoch": 4.656270305393113, "grad_norm": 0.8314655423164368, "learning_rate": 3.4372969460688763e-06, "loss": 0.1129, "step": 35830 }, { "epoch": 4.657569850552306, "grad_norm": 0.24939680099487305, "learning_rate": 3.424301494476933e-06, "loss": 0.1501, "step": 35840 }, { "epoch": 4.658869395711501, "grad_norm": 0.09438829869031906, "learning_rate": 3.41130604288499e-06, "loss": 0.0833, "step": 35850 }, { "epoch": 4.660168940870696, "grad_norm": 1.198899745941162, "learning_rate": 3.3983105912930477e-06, "loss": 0.1489, "step": 35860 }, { "epoch": 4.661468486029889, "grad_norm": 0.21109728515148163, "learning_rate": 3.385315139701105e-06, "loss": 0.0881, "step": 35870 }, { "epoch": 4.662768031189084, "grad_norm": 0.4084257185459137, "learning_rate": 3.3723196881091624e-06, "loss": 0.1582, "step": 35880 }, { "epoch": 4.664067576348278, "grad_norm": 0.6371760964393616, "learning_rate": 3.3593242365172186e-06, "loss": 0.1155, "step": 35890 }, { "epoch": 4.665367121507472, "grad_norm": 0.18256497383117676, "learning_rate": 3.346328784925276e-06, "loss": 0.1367, "step": 35900 }, { "epoch": 4.666666666666667, "grad_norm": 0.7191747426986694, "learning_rate": 3.3333333333333333e-06, "loss": 0.1303, "step": 35910 }, { "epoch": 4.667966211825861, "grad_norm": 0.5962021946907043, "learning_rate": 3.320337881741391e-06, "loss": 0.0944, "step": 35920 }, { "epoch": 4.669265756985055, "grad_norm": 1.8006960153579712, "learning_rate": 3.307342430149448e-06, "loss": 0.1519, "step": 35930 }, { "epoch": 4.670565302144249, "grad_norm": 0.30540063977241516, "learning_rate": 3.2943469785575047e-06, "loss": 0.127, "step": 35940 }, { "epoch": 4.671864847303444, "grad_norm": 0.1880054771900177, "learning_rate": 3.281351526965562e-06, "loss": 0.0781, "step": 35950 }, { "epoch": 4.673164392462638, "grad_norm": 0.19967949390411377, "learning_rate": 3.2683560753736194e-06, "loss": 0.1355, "step": 35960 }, { "epoch": 4.674463937621832, "grad_norm": 0.9759343266487122, "learning_rate": 3.2553606237816765e-06, "loss": 0.1366, "step": 35970 }, { "epoch": 4.675763482781027, "grad_norm": 0.4397076368331909, "learning_rate": 3.242365172189734e-06, "loss": 0.1137, "step": 35980 }, { "epoch": 4.677063027940221, "grad_norm": 0.901160478591919, "learning_rate": 3.2293697205977908e-06, "loss": 0.0745, "step": 35990 }, { "epoch": 4.678362573099415, "grad_norm": 0.31183236837387085, "learning_rate": 3.216374269005848e-06, "loss": 0.1448, "step": 36000 }, { "epoch": 4.67966211825861, "grad_norm": 0.42338165640830994, "learning_rate": 3.2033788174139054e-06, "loss": 0.1612, "step": 36010 }, { "epoch": 4.680961663417804, "grad_norm": 0.6426999568939209, "learning_rate": 3.1903833658219626e-06, "loss": 0.1055, "step": 36020 }, { "epoch": 4.682261208576998, "grad_norm": 0.4059343636035919, "learning_rate": 3.1773879142300197e-06, "loss": 0.1235, "step": 36030 }, { "epoch": 4.6835607537361925, "grad_norm": 0.2965996563434601, "learning_rate": 3.1643924626380764e-06, "loss": 0.0992, "step": 36040 }, { "epoch": 4.684860298895386, "grad_norm": 0.2280224710702896, "learning_rate": 3.151397011046134e-06, "loss": 0.1096, "step": 36050 }, { "epoch": 4.686159844054581, "grad_norm": 0.2991485297679901, "learning_rate": 3.138401559454191e-06, "loss": 0.0947, "step": 36060 }, { "epoch": 4.6874593892137755, "grad_norm": 0.34488895535469055, "learning_rate": 3.1254061078622486e-06, "loss": 0.0907, "step": 36070 }, { "epoch": 4.688758934372969, "grad_norm": 0.4160277545452118, "learning_rate": 3.1124106562703053e-06, "loss": 0.1026, "step": 36080 }, { "epoch": 4.690058479532164, "grad_norm": 1.1387712955474854, "learning_rate": 3.099415204678363e-06, "loss": 0.1278, "step": 36090 }, { "epoch": 4.6913580246913575, "grad_norm": 0.5861008763313293, "learning_rate": 3.0864197530864196e-06, "loss": 0.1146, "step": 36100 }, { "epoch": 4.692657569850552, "grad_norm": 0.3389401435852051, "learning_rate": 3.073424301494477e-06, "loss": 0.1013, "step": 36110 }, { "epoch": 4.693957115009747, "grad_norm": 0.3304878771305084, "learning_rate": 3.0604288499025342e-06, "loss": 0.111, "step": 36120 }, { "epoch": 4.6952566601689405, "grad_norm": 0.49568626284599304, "learning_rate": 3.0474333983105914e-06, "loss": 0.1459, "step": 36130 }, { "epoch": 4.696556205328135, "grad_norm": 0.434758722782135, "learning_rate": 3.0344379467186485e-06, "loss": 0.0952, "step": 36140 }, { "epoch": 4.69785575048733, "grad_norm": 0.22243529558181763, "learning_rate": 3.0214424951267056e-06, "loss": 0.0856, "step": 36150 }, { "epoch": 4.6991552956465235, "grad_norm": 0.2957035303115845, "learning_rate": 3.0084470435347628e-06, "loss": 0.1152, "step": 36160 }, { "epoch": 4.700454840805718, "grad_norm": 0.3574669063091278, "learning_rate": 2.9954515919428203e-06, "loss": 0.0914, "step": 36170 }, { "epoch": 4.701754385964913, "grad_norm": 0.8645700216293335, "learning_rate": 2.9824561403508774e-06, "loss": 0.1385, "step": 36180 }, { "epoch": 4.703053931124106, "grad_norm": 0.2883896231651306, "learning_rate": 2.9694606887589346e-06, "loss": 0.1233, "step": 36190 }, { "epoch": 4.704353476283301, "grad_norm": 0.1798774152994156, "learning_rate": 2.9564652371669917e-06, "loss": 0.1505, "step": 36200 }, { "epoch": 4.705653021442495, "grad_norm": 0.5426915287971497, "learning_rate": 2.943469785575049e-06, "loss": 0.0806, "step": 36210 }, { "epoch": 4.706952566601689, "grad_norm": 0.240564227104187, "learning_rate": 2.9304743339831064e-06, "loss": 0.1221, "step": 36220 }, { "epoch": 4.708252111760884, "grad_norm": 0.3779265284538269, "learning_rate": 2.917478882391163e-06, "loss": 0.1272, "step": 36230 }, { "epoch": 4.709551656920078, "grad_norm": 0.7420450448989868, "learning_rate": 2.9044834307992206e-06, "loss": 0.092, "step": 36240 }, { "epoch": 4.710851202079272, "grad_norm": 0.596621572971344, "learning_rate": 2.8914879792072773e-06, "loss": 0.0936, "step": 36250 }, { "epoch": 4.712150747238466, "grad_norm": 0.3556285500526428, "learning_rate": 2.878492527615335e-06, "loss": 0.0983, "step": 36260 }, { "epoch": 4.713450292397661, "grad_norm": 0.8812591433525085, "learning_rate": 2.865497076023392e-06, "loss": 0.0947, "step": 36270 }, { "epoch": 4.714749837556855, "grad_norm": 0.2138938456773758, "learning_rate": 2.852501624431449e-06, "loss": 0.1424, "step": 36280 }, { "epoch": 4.716049382716049, "grad_norm": 0.7849701642990112, "learning_rate": 2.8395061728395062e-06, "loss": 0.0781, "step": 36290 }, { "epoch": 4.717348927875244, "grad_norm": 0.8921722769737244, "learning_rate": 2.8265107212475634e-06, "loss": 0.1397, "step": 36300 }, { "epoch": 4.718648473034438, "grad_norm": 0.6744920015335083, "learning_rate": 2.8135152696556205e-06, "loss": 0.1032, "step": 36310 }, { "epoch": 4.719948018193632, "grad_norm": 0.2917276918888092, "learning_rate": 2.800519818063678e-06, "loss": 0.104, "step": 36320 }, { "epoch": 4.721247563352827, "grad_norm": 0.3741821348667145, "learning_rate": 2.7875243664717347e-06, "loss": 0.1219, "step": 36330 }, { "epoch": 4.722547108512021, "grad_norm": 0.6731634140014648, "learning_rate": 2.7745289148797923e-06, "loss": 0.1216, "step": 36340 }, { "epoch": 4.723846653671215, "grad_norm": 0.10997895896434784, "learning_rate": 2.7615334632878494e-06, "loss": 0.0927, "step": 36350 }, { "epoch": 4.7251461988304095, "grad_norm": 0.2928926646709442, "learning_rate": 2.7485380116959066e-06, "loss": 0.102, "step": 36360 }, { "epoch": 4.726445743989603, "grad_norm": 0.3841615319252014, "learning_rate": 2.7355425601039637e-06, "loss": 0.0869, "step": 36370 }, { "epoch": 4.727745289148798, "grad_norm": 0.8051868677139282, "learning_rate": 2.722547108512021e-06, "loss": 0.0785, "step": 36380 }, { "epoch": 4.7290448343079925, "grad_norm": 0.7272079586982727, "learning_rate": 2.7095516569200784e-06, "loss": 0.0959, "step": 36390 }, { "epoch": 4.730344379467186, "grad_norm": 1.1180084943771362, "learning_rate": 2.696556205328135e-06, "loss": 0.2082, "step": 36400 }, { "epoch": 4.731643924626381, "grad_norm": 1.5352845191955566, "learning_rate": 2.6835607537361926e-06, "loss": 0.1328, "step": 36410 }, { "epoch": 4.732943469785575, "grad_norm": 0.6514154076576233, "learning_rate": 2.6705653021442497e-06, "loss": 0.1444, "step": 36420 }, { "epoch": 4.734243014944769, "grad_norm": 0.1705578863620758, "learning_rate": 2.657569850552307e-06, "loss": 0.133, "step": 36430 }, { "epoch": 4.735542560103964, "grad_norm": 1.2109134197235107, "learning_rate": 2.644574398960364e-06, "loss": 0.1047, "step": 36440 }, { "epoch": 4.7368421052631575, "grad_norm": 0.526309072971344, "learning_rate": 2.631578947368421e-06, "loss": 0.0961, "step": 36450 }, { "epoch": 4.738141650422352, "grad_norm": 0.3482514023780823, "learning_rate": 2.6185834957764782e-06, "loss": 0.136, "step": 36460 }, { "epoch": 4.739441195581547, "grad_norm": 0.4405059218406677, "learning_rate": 2.6055880441845358e-06, "loss": 0.1099, "step": 36470 }, { "epoch": 4.7407407407407405, "grad_norm": 0.21282649040222168, "learning_rate": 2.5925925925925925e-06, "loss": 0.0768, "step": 36480 }, { "epoch": 4.742040285899935, "grad_norm": 0.31197813153266907, "learning_rate": 2.57959714100065e-06, "loss": 0.1842, "step": 36490 }, { "epoch": 4.74333983105913, "grad_norm": 0.8586112260818481, "learning_rate": 2.5666016894087067e-06, "loss": 0.0813, "step": 36500 }, { "epoch": 4.744639376218323, "grad_norm": 1.1909587383270264, "learning_rate": 2.5536062378167643e-06, "loss": 0.1544, "step": 36510 }, { "epoch": 4.745938921377518, "grad_norm": 0.3050549328327179, "learning_rate": 2.5406107862248214e-06, "loss": 0.1134, "step": 36520 }, { "epoch": 4.747238466536712, "grad_norm": 0.47882163524627686, "learning_rate": 2.5276153346328785e-06, "loss": 0.108, "step": 36530 }, { "epoch": 4.748538011695906, "grad_norm": 0.7300613522529602, "learning_rate": 2.5146198830409357e-06, "loss": 0.1127, "step": 36540 }, { "epoch": 4.749837556855101, "grad_norm": 0.36268627643585205, "learning_rate": 2.501624431448993e-06, "loss": 0.0957, "step": 36550 }, { "epoch": 4.751137102014295, "grad_norm": 0.16581828892230988, "learning_rate": 2.4886289798570503e-06, "loss": 0.1148, "step": 36560 }, { "epoch": 4.752436647173489, "grad_norm": 0.4425658583641052, "learning_rate": 2.4756335282651075e-06, "loss": 0.1035, "step": 36570 }, { "epoch": 4.753736192332683, "grad_norm": 0.42725634574890137, "learning_rate": 2.4626380766731646e-06, "loss": 0.1416, "step": 36580 }, { "epoch": 4.755035737491878, "grad_norm": 0.8429345488548279, "learning_rate": 2.4496426250812217e-06, "loss": 0.1422, "step": 36590 }, { "epoch": 4.756335282651072, "grad_norm": 0.21577255427837372, "learning_rate": 2.436647173489279e-06, "loss": 0.1023, "step": 36600 }, { "epoch": 4.757634827810266, "grad_norm": 0.5349097847938538, "learning_rate": 2.423651721897336e-06, "loss": 0.121, "step": 36610 }, { "epoch": 4.758934372969461, "grad_norm": 0.8335732221603394, "learning_rate": 2.4106562703053935e-06, "loss": 0.1226, "step": 36620 }, { "epoch": 4.760233918128655, "grad_norm": 0.33164578676223755, "learning_rate": 2.3976608187134502e-06, "loss": 0.1012, "step": 36630 }, { "epoch": 4.761533463287849, "grad_norm": 0.5284934639930725, "learning_rate": 2.3846653671215078e-06, "loss": 0.113, "step": 36640 }, { "epoch": 4.762833008447044, "grad_norm": 0.6982039213180542, "learning_rate": 2.3716699155295645e-06, "loss": 0.1149, "step": 36650 }, { "epoch": 4.764132553606238, "grad_norm": 0.6493982672691345, "learning_rate": 2.358674463937622e-06, "loss": 0.1, "step": 36660 }, { "epoch": 4.765432098765432, "grad_norm": 1.1599425077438354, "learning_rate": 2.345679012345679e-06, "loss": 0.121, "step": 36670 }, { "epoch": 4.766731643924627, "grad_norm": 0.6410452127456665, "learning_rate": 2.3326835607537363e-06, "loss": 0.1192, "step": 36680 }, { "epoch": 4.76803118908382, "grad_norm": 0.3141563832759857, "learning_rate": 2.3196881091617934e-06, "loss": 0.1214, "step": 36690 }, { "epoch": 4.769330734243015, "grad_norm": 0.1804049164056778, "learning_rate": 2.3066926575698505e-06, "loss": 0.0962, "step": 36700 }, { "epoch": 4.7706302794022095, "grad_norm": 0.3725380003452301, "learning_rate": 2.2936972059779077e-06, "loss": 0.0707, "step": 36710 }, { "epoch": 4.771929824561403, "grad_norm": 0.28083404898643494, "learning_rate": 2.2807017543859652e-06, "loss": 0.1164, "step": 36720 }, { "epoch": 4.773229369720598, "grad_norm": 0.5185171365737915, "learning_rate": 2.2677063027940223e-06, "loss": 0.1181, "step": 36730 }, { "epoch": 4.774528914879792, "grad_norm": 0.6880064010620117, "learning_rate": 2.2547108512020795e-06, "loss": 0.1, "step": 36740 }, { "epoch": 4.775828460038986, "grad_norm": 0.2452654391527176, "learning_rate": 2.2417153996101366e-06, "loss": 0.1015, "step": 36750 }, { "epoch": 4.777128005198181, "grad_norm": 0.6725175380706787, "learning_rate": 2.2287199480181937e-06, "loss": 0.1569, "step": 36760 }, { "epoch": 4.7784275503573745, "grad_norm": 0.17078347504138947, "learning_rate": 2.2157244964262513e-06, "loss": 0.0914, "step": 36770 }, { "epoch": 4.779727095516569, "grad_norm": 0.15884675085544586, "learning_rate": 2.202729044834308e-06, "loss": 0.0992, "step": 36780 }, { "epoch": 4.781026640675764, "grad_norm": 0.934307873249054, "learning_rate": 2.1897335932423655e-06, "loss": 0.1158, "step": 36790 }, { "epoch": 4.7823261858349575, "grad_norm": 0.31209903955459595, "learning_rate": 2.1767381416504222e-06, "loss": 0.0863, "step": 36800 }, { "epoch": 4.783625730994152, "grad_norm": 0.4262753427028656, "learning_rate": 2.1637426900584798e-06, "loss": 0.0879, "step": 36810 }, { "epoch": 4.784925276153347, "grad_norm": 0.11666608601808548, "learning_rate": 2.150747238466537e-06, "loss": 0.0898, "step": 36820 }, { "epoch": 4.7862248213125405, "grad_norm": 0.31528669595718384, "learning_rate": 2.137751786874594e-06, "loss": 0.1053, "step": 36830 }, { "epoch": 4.787524366471735, "grad_norm": 1.1958028078079224, "learning_rate": 2.124756335282651e-06, "loss": 0.1755, "step": 36840 }, { "epoch": 4.788823911630929, "grad_norm": 1.0496412515640259, "learning_rate": 2.1117608836907083e-06, "loss": 0.1434, "step": 36850 }, { "epoch": 4.790123456790123, "grad_norm": 0.39207276701927185, "learning_rate": 2.0987654320987654e-06, "loss": 0.1024, "step": 36860 }, { "epoch": 4.791423001949318, "grad_norm": 0.41359132528305054, "learning_rate": 2.085769980506823e-06, "loss": 0.0843, "step": 36870 }, { "epoch": 4.792722547108512, "grad_norm": 0.6467024087905884, "learning_rate": 2.0727745289148797e-06, "loss": 0.121, "step": 36880 }, { "epoch": 4.794022092267706, "grad_norm": 0.4730730354785919, "learning_rate": 2.0597790773229372e-06, "loss": 0.106, "step": 36890 }, { "epoch": 4.7953216374269, "grad_norm": 0.2365313470363617, "learning_rate": 2.0467836257309943e-06, "loss": 0.108, "step": 36900 }, { "epoch": 4.796621182586095, "grad_norm": 0.29800665378570557, "learning_rate": 2.0337881741390515e-06, "loss": 0.0888, "step": 36910 }, { "epoch": 4.797920727745289, "grad_norm": 1.4567769765853882, "learning_rate": 2.0207927225471086e-06, "loss": 0.1174, "step": 36920 }, { "epoch": 4.799220272904483, "grad_norm": 0.3685609698295593, "learning_rate": 2.0077972709551657e-06, "loss": 0.1401, "step": 36930 }, { "epoch": 4.800519818063678, "grad_norm": 0.4544294774532318, "learning_rate": 1.9948018193632233e-06, "loss": 0.126, "step": 36940 }, { "epoch": 4.801819363222872, "grad_norm": 0.7256747484207153, "learning_rate": 1.98180636777128e-06, "loss": 0.1361, "step": 36950 }, { "epoch": 4.803118908382066, "grad_norm": 0.23361191153526306, "learning_rate": 1.9688109161793375e-06, "loss": 0.1219, "step": 36960 }, { "epoch": 4.804418453541261, "grad_norm": 0.3115857243537903, "learning_rate": 1.9558154645873946e-06, "loss": 0.1031, "step": 36970 }, { "epoch": 4.805717998700455, "grad_norm": 1.3264601230621338, "learning_rate": 1.9428200129954518e-06, "loss": 0.1582, "step": 36980 }, { "epoch": 4.807017543859649, "grad_norm": 0.4758007228374481, "learning_rate": 1.929824561403509e-06, "loss": 0.1154, "step": 36990 }, { "epoch": 4.808317089018844, "grad_norm": 0.5160518884658813, "learning_rate": 1.916829109811566e-06, "loss": 0.0838, "step": 37000 }, { "epoch": 4.809616634178037, "grad_norm": 0.6937558650970459, "learning_rate": 1.9038336582196232e-06, "loss": 0.1899, "step": 37010 }, { "epoch": 4.810916179337232, "grad_norm": 0.3148116171360016, "learning_rate": 1.8908382066276805e-06, "loss": 0.0666, "step": 37020 }, { "epoch": 4.8122157244964265, "grad_norm": 0.551135778427124, "learning_rate": 1.8778427550357376e-06, "loss": 0.1046, "step": 37030 }, { "epoch": 4.81351526965562, "grad_norm": 1.0834245681762695, "learning_rate": 1.864847303443795e-06, "loss": 0.0776, "step": 37040 }, { "epoch": 4.814814814814815, "grad_norm": 0.5557347536087036, "learning_rate": 1.8518518518518519e-06, "loss": 0.0725, "step": 37050 }, { "epoch": 4.816114359974009, "grad_norm": 0.44041913747787476, "learning_rate": 1.8388564002599092e-06, "loss": 0.0931, "step": 37060 }, { "epoch": 4.817413905133203, "grad_norm": 0.9333682656288147, "learning_rate": 1.8258609486679661e-06, "loss": 0.0962, "step": 37070 }, { "epoch": 4.818713450292398, "grad_norm": 0.32290008664131165, "learning_rate": 1.8128654970760235e-06, "loss": 0.085, "step": 37080 }, { "epoch": 4.820012995451592, "grad_norm": 0.3950664699077606, "learning_rate": 1.7998700454840808e-06, "loss": 0.2867, "step": 37090 }, { "epoch": 4.821312540610786, "grad_norm": 0.8557788729667664, "learning_rate": 1.7868745938921377e-06, "loss": 0.0943, "step": 37100 }, { "epoch": 4.822612085769981, "grad_norm": 1.0689791440963745, "learning_rate": 1.773879142300195e-06, "loss": 0.1567, "step": 37110 }, { "epoch": 4.8239116309291745, "grad_norm": 0.770615816116333, "learning_rate": 1.760883690708252e-06, "loss": 0.0995, "step": 37120 }, { "epoch": 4.825211176088369, "grad_norm": 1.4544390439987183, "learning_rate": 1.7478882391163093e-06, "loss": 0.0848, "step": 37130 }, { "epoch": 4.826510721247564, "grad_norm": 0.2871391177177429, "learning_rate": 1.7348927875243666e-06, "loss": 0.0886, "step": 37140 }, { "epoch": 4.8278102664067575, "grad_norm": 0.38072994351387024, "learning_rate": 1.7218973359324236e-06, "loss": 0.1113, "step": 37150 }, { "epoch": 4.829109811565952, "grad_norm": 0.8304562568664551, "learning_rate": 1.708901884340481e-06, "loss": 0.1212, "step": 37160 }, { "epoch": 4.830409356725146, "grad_norm": 0.24211692810058594, "learning_rate": 1.695906432748538e-06, "loss": 0.0788, "step": 37170 }, { "epoch": 4.83170890188434, "grad_norm": 0.3934166431427002, "learning_rate": 1.6829109811565951e-06, "loss": 0.1051, "step": 37180 }, { "epoch": 4.833008447043535, "grad_norm": 0.7475360035896301, "learning_rate": 1.6699155295646525e-06, "loss": 0.1242, "step": 37190 }, { "epoch": 4.834307992202729, "grad_norm": 0.25333911180496216, "learning_rate": 1.6569200779727096e-06, "loss": 0.1071, "step": 37200 }, { "epoch": 4.835607537361923, "grad_norm": 0.7854872345924377, "learning_rate": 1.643924626380767e-06, "loss": 0.1297, "step": 37210 }, { "epoch": 4.836907082521117, "grad_norm": 0.285480260848999, "learning_rate": 1.6309291747888239e-06, "loss": 0.1424, "step": 37220 }, { "epoch": 4.838206627680312, "grad_norm": 0.22117172181606293, "learning_rate": 1.6179337231968812e-06, "loss": 0.1011, "step": 37230 }, { "epoch": 4.839506172839506, "grad_norm": 0.9053769707679749, "learning_rate": 1.6049382716049385e-06, "loss": 0.1409, "step": 37240 }, { "epoch": 4.8408057179987, "grad_norm": 0.5500437021255493, "learning_rate": 1.5919428200129955e-06, "loss": 0.0955, "step": 37250 }, { "epoch": 4.842105263157895, "grad_norm": 0.5661344528198242, "learning_rate": 1.5789473684210528e-06, "loss": 0.0739, "step": 37260 }, { "epoch": 4.843404808317089, "grad_norm": 0.17742320895195007, "learning_rate": 1.5659519168291097e-06, "loss": 0.1037, "step": 37270 }, { "epoch": 4.844704353476283, "grad_norm": 0.31380435824394226, "learning_rate": 1.552956465237167e-06, "loss": 0.1118, "step": 37280 }, { "epoch": 4.846003898635478, "grad_norm": 0.7698327898979187, "learning_rate": 1.5399610136452242e-06, "loss": 0.1148, "step": 37290 }, { "epoch": 4.847303443794672, "grad_norm": 0.44763892889022827, "learning_rate": 1.5269655620532813e-06, "loss": 0.1457, "step": 37300 }, { "epoch": 4.848602988953866, "grad_norm": 0.46338340640068054, "learning_rate": 1.5139701104613386e-06, "loss": 0.1041, "step": 37310 }, { "epoch": 4.849902534113061, "grad_norm": 0.19905243813991547, "learning_rate": 1.5009746588693958e-06, "loss": 0.1003, "step": 37320 }, { "epoch": 4.851202079272254, "grad_norm": 2.2512524127960205, "learning_rate": 1.4879792072774529e-06, "loss": 0.1747, "step": 37330 }, { "epoch": 4.852501624431449, "grad_norm": 0.265305757522583, "learning_rate": 1.47498375568551e-06, "loss": 0.0701, "step": 37340 }, { "epoch": 4.853801169590644, "grad_norm": 0.553920567035675, "learning_rate": 1.4619883040935671e-06, "loss": 0.1045, "step": 37350 }, { "epoch": 4.855100714749837, "grad_norm": 0.25853851437568665, "learning_rate": 1.4489928525016245e-06, "loss": 0.1174, "step": 37360 }, { "epoch": 4.856400259909032, "grad_norm": 0.5233213305473328, "learning_rate": 1.4359974009096816e-06, "loss": 0.0904, "step": 37370 }, { "epoch": 4.857699805068226, "grad_norm": 0.4407787024974823, "learning_rate": 1.423001949317739e-06, "loss": 0.1308, "step": 37380 }, { "epoch": 4.85899935022742, "grad_norm": 0.7992346286773682, "learning_rate": 1.410006497725796e-06, "loss": 0.1494, "step": 37390 }, { "epoch": 4.860298895386615, "grad_norm": 0.7893975973129272, "learning_rate": 1.3970110461338532e-06, "loss": 0.1437, "step": 37400 }, { "epoch": 4.861598440545809, "grad_norm": 0.8524181246757507, "learning_rate": 1.3840155945419105e-06, "loss": 0.1378, "step": 37410 }, { "epoch": 4.862897985705003, "grad_norm": 0.8182418942451477, "learning_rate": 1.3710201429499677e-06, "loss": 0.1172, "step": 37420 }, { "epoch": 4.864197530864198, "grad_norm": 0.4462776482105255, "learning_rate": 1.3580246913580248e-06, "loss": 0.1306, "step": 37430 }, { "epoch": 4.8654970760233915, "grad_norm": 0.8014011383056641, "learning_rate": 1.345029239766082e-06, "loss": 0.1096, "step": 37440 }, { "epoch": 4.866796621182586, "grad_norm": 0.6478471159934998, "learning_rate": 1.332033788174139e-06, "loss": 0.0889, "step": 37450 }, { "epoch": 4.868096166341781, "grad_norm": 0.37283000349998474, "learning_rate": 1.3190383365821964e-06, "loss": 0.0986, "step": 37460 }, { "epoch": 4.8693957115009745, "grad_norm": 0.6830462217330933, "learning_rate": 1.3060428849902535e-06, "loss": 0.1427, "step": 37470 }, { "epoch": 4.870695256660169, "grad_norm": 0.1349850744009018, "learning_rate": 1.2930474333983106e-06, "loss": 0.0718, "step": 37480 }, { "epoch": 4.871994801819363, "grad_norm": 0.19230441749095917, "learning_rate": 1.2800519818063678e-06, "loss": 0.0872, "step": 37490 }, { "epoch": 4.8732943469785575, "grad_norm": 0.2903516888618469, "learning_rate": 1.2670565302144249e-06, "loss": 0.1045, "step": 37500 }, { "epoch": 4.874593892137752, "grad_norm": 0.21841681003570557, "learning_rate": 1.2540610786224822e-06, "loss": 0.1118, "step": 37510 }, { "epoch": 4.875893437296946, "grad_norm": 0.294327050447464, "learning_rate": 1.2410656270305394e-06, "loss": 0.081, "step": 37520 }, { "epoch": 4.87719298245614, "grad_norm": 0.46652308106422424, "learning_rate": 1.2280701754385965e-06, "loss": 0.0892, "step": 37530 }, { "epoch": 4.878492527615334, "grad_norm": 0.4134247601032257, "learning_rate": 1.2150747238466536e-06, "loss": 0.1332, "step": 37540 }, { "epoch": 4.879792072774529, "grad_norm": 0.4954071044921875, "learning_rate": 1.202079272254711e-06, "loss": 0.096, "step": 37550 }, { "epoch": 4.881091617933723, "grad_norm": 0.26565831899642944, "learning_rate": 1.189083820662768e-06, "loss": 0.105, "step": 37560 }, { "epoch": 4.882391163092917, "grad_norm": 0.6494954228401184, "learning_rate": 1.1760883690708254e-06, "loss": 0.0921, "step": 37570 }, { "epoch": 4.883690708252112, "grad_norm": 0.2212117463350296, "learning_rate": 1.1630929174788825e-06, "loss": 0.1233, "step": 37580 }, { "epoch": 4.884990253411306, "grad_norm": 0.31680622696876526, "learning_rate": 1.1500974658869397e-06, "loss": 0.0817, "step": 37590 }, { "epoch": 4.8862897985705, "grad_norm": 0.2000313252210617, "learning_rate": 1.1371020142949968e-06, "loss": 0.1389, "step": 37600 }, { "epoch": 4.887589343729695, "grad_norm": 0.4626791477203369, "learning_rate": 1.1241065627030541e-06, "loss": 0.103, "step": 37610 }, { "epoch": 4.888888888888889, "grad_norm": 0.6757096648216248, "learning_rate": 1.1111111111111112e-06, "loss": 0.1097, "step": 37620 }, { "epoch": 4.890188434048083, "grad_norm": 0.11480850726366043, "learning_rate": 1.0981156595191684e-06, "loss": 0.1206, "step": 37630 }, { "epoch": 4.891487979207278, "grad_norm": 0.2784839868545532, "learning_rate": 1.0851202079272255e-06, "loss": 0.0783, "step": 37640 }, { "epoch": 4.892787524366471, "grad_norm": 0.5949620008468628, "learning_rate": 1.0721247563352826e-06, "loss": 0.1841, "step": 37650 }, { "epoch": 4.894087069525666, "grad_norm": 0.5146350860595703, "learning_rate": 1.05912930474334e-06, "loss": 0.0845, "step": 37660 }, { "epoch": 4.895386614684861, "grad_norm": 0.4033504128456116, "learning_rate": 1.046133853151397e-06, "loss": 0.1009, "step": 37670 }, { "epoch": 4.896686159844054, "grad_norm": 0.4812397360801697, "learning_rate": 1.0331384015594542e-06, "loss": 0.1574, "step": 37680 }, { "epoch": 4.897985705003249, "grad_norm": 0.8050066828727722, "learning_rate": 1.0201429499675113e-06, "loss": 0.0808, "step": 37690 }, { "epoch": 4.899285250162443, "grad_norm": 0.17372384667396545, "learning_rate": 1.0071474983755685e-06, "loss": 0.079, "step": 37700 }, { "epoch": 4.900584795321637, "grad_norm": 0.3835534453392029, "learning_rate": 9.941520467836258e-07, "loss": 0.1255, "step": 37710 }, { "epoch": 4.901884340480832, "grad_norm": 0.2864707410335541, "learning_rate": 9.81156595191683e-07, "loss": 0.1052, "step": 37720 }, { "epoch": 4.903183885640026, "grad_norm": 0.994801938533783, "learning_rate": 9.6816114359974e-07, "loss": 0.1331, "step": 37730 }, { "epoch": 4.90448343079922, "grad_norm": 0.3783752918243408, "learning_rate": 9.551656920077974e-07, "loss": 0.0885, "step": 37740 }, { "epoch": 4.905782975958415, "grad_norm": 0.17705276608467102, "learning_rate": 9.421702404158544e-07, "loss": 0.1095, "step": 37750 }, { "epoch": 4.907082521117609, "grad_norm": 0.3456631302833557, "learning_rate": 9.291747888239115e-07, "loss": 0.1756, "step": 37760 }, { "epoch": 4.908382066276803, "grad_norm": 0.4172511696815491, "learning_rate": 9.161793372319689e-07, "loss": 0.0841, "step": 37770 }, { "epoch": 4.909681611435998, "grad_norm": 0.6624168753623962, "learning_rate": 9.03183885640026e-07, "loss": 0.1655, "step": 37780 }, { "epoch": 4.9109811565951915, "grad_norm": 0.6527529358863831, "learning_rate": 8.901884340480832e-07, "loss": 0.1162, "step": 37790 }, { "epoch": 4.912280701754386, "grad_norm": 1.1891287565231323, "learning_rate": 8.771929824561404e-07, "loss": 0.187, "step": 37800 }, { "epoch": 4.91358024691358, "grad_norm": 0.8151688575744629, "learning_rate": 8.641975308641975e-07, "loss": 0.1241, "step": 37810 }, { "epoch": 4.9148797920727745, "grad_norm": 0.46518081426620483, "learning_rate": 8.512020792722548e-07, "loss": 0.1, "step": 37820 }, { "epoch": 4.916179337231969, "grad_norm": 0.4804568290710449, "learning_rate": 8.38206627680312e-07, "loss": 0.0921, "step": 37830 }, { "epoch": 4.917478882391163, "grad_norm": 0.2855057716369629, "learning_rate": 8.252111760883691e-07, "loss": 0.0852, "step": 37840 }, { "epoch": 4.918778427550357, "grad_norm": 0.7264400720596313, "learning_rate": 8.122157244964262e-07, "loss": 0.1423, "step": 37850 }, { "epoch": 4.920077972709551, "grad_norm": 0.4113144278526306, "learning_rate": 7.992202729044833e-07, "loss": 0.0875, "step": 37860 }, { "epoch": 4.921377517868746, "grad_norm": 0.349338561296463, "learning_rate": 7.862248213125407e-07, "loss": 0.0955, "step": 37870 }, { "epoch": 4.92267706302794, "grad_norm": 0.6202232837677002, "learning_rate": 7.732293697205978e-07, "loss": 0.0949, "step": 37880 }, { "epoch": 4.923976608187134, "grad_norm": 0.26609134674072266, "learning_rate": 7.60233918128655e-07, "loss": 0.1279, "step": 37890 }, { "epoch": 4.925276153346329, "grad_norm": 0.27525612711906433, "learning_rate": 7.472384665367122e-07, "loss": 0.1304, "step": 37900 }, { "epoch": 4.926575698505523, "grad_norm": 0.3350059688091278, "learning_rate": 7.342430149447694e-07, "loss": 0.129, "step": 37910 }, { "epoch": 4.927875243664717, "grad_norm": 0.512259304523468, "learning_rate": 7.212475633528265e-07, "loss": 0.1285, "step": 37920 }, { "epoch": 4.929174788823912, "grad_norm": 0.9909676909446716, "learning_rate": 7.082521117608838e-07, "loss": 0.1325, "step": 37930 }, { "epoch": 4.930474333983106, "grad_norm": 0.2793431282043457, "learning_rate": 6.952566601689409e-07, "loss": 0.1238, "step": 37940 }, { "epoch": 4.9317738791423, "grad_norm": 0.9151816368103027, "learning_rate": 6.82261208576998e-07, "loss": 0.0963, "step": 37950 }, { "epoch": 4.933073424301495, "grad_norm": 0.303997665643692, "learning_rate": 6.692657569850552e-07, "loss": 0.0983, "step": 37960 }, { "epoch": 4.934372969460688, "grad_norm": 0.29338085651397705, "learning_rate": 6.562703053931125e-07, "loss": 0.0985, "step": 37970 }, { "epoch": 4.935672514619883, "grad_norm": 0.1861090064048767, "learning_rate": 6.432748538011697e-07, "loss": 0.1046, "step": 37980 }, { "epoch": 4.936972059779078, "grad_norm": 0.7891502976417542, "learning_rate": 6.302794022092268e-07, "loss": 0.1344, "step": 37990 }, { "epoch": 4.938271604938271, "grad_norm": 0.9650517702102661, "learning_rate": 6.17283950617284e-07, "loss": 0.1333, "step": 38000 }, { "epoch": 4.939571150097466, "grad_norm": 0.5235655307769775, "learning_rate": 6.042884990253412e-07, "loss": 0.1448, "step": 38010 }, { "epoch": 4.94087069525666, "grad_norm": 0.4571723937988281, "learning_rate": 5.912930474333983e-07, "loss": 0.1159, "step": 38020 }, { "epoch": 4.942170240415854, "grad_norm": 0.3222062289714813, "learning_rate": 5.782975958414555e-07, "loss": 0.1174, "step": 38030 }, { "epoch": 4.943469785575049, "grad_norm": 0.4053152799606323, "learning_rate": 5.653021442495127e-07, "loss": 0.1073, "step": 38040 }, { "epoch": 4.944769330734243, "grad_norm": 0.17077328264713287, "learning_rate": 5.523066926575698e-07, "loss": 0.1076, "step": 38050 }, { "epoch": 4.946068875893437, "grad_norm": 0.28341540694236755, "learning_rate": 5.39311241065627e-07, "loss": 0.1464, "step": 38060 }, { "epoch": 4.947368421052632, "grad_norm": 0.4506392478942871, "learning_rate": 5.263157894736843e-07, "loss": 0.1252, "step": 38070 }, { "epoch": 4.948667966211826, "grad_norm": 0.5838063359260559, "learning_rate": 5.133203378817415e-07, "loss": 0.0958, "step": 38080 }, { "epoch": 4.94996751137102, "grad_norm": 0.2677607238292694, "learning_rate": 5.003248862897986e-07, "loss": 0.1713, "step": 38090 }, { "epoch": 4.951267056530215, "grad_norm": 0.2748105823993683, "learning_rate": 4.873294346978557e-07, "loss": 0.0684, "step": 38100 }, { "epoch": 4.9525666016894085, "grad_norm": 0.37390226125717163, "learning_rate": 4.74333983105913e-07, "loss": 0.1651, "step": 38110 }, { "epoch": 4.953866146848603, "grad_norm": 0.10736096650362015, "learning_rate": 4.613385315139701e-07, "loss": 0.0803, "step": 38120 }, { "epoch": 4.955165692007797, "grad_norm": 0.28304219245910645, "learning_rate": 4.483430799220273e-07, "loss": 0.1301, "step": 38130 }, { "epoch": 4.9564652371669915, "grad_norm": 0.4652070999145508, "learning_rate": 4.353476283300845e-07, "loss": 0.0882, "step": 38140 }, { "epoch": 4.957764782326186, "grad_norm": 0.8172307014465332, "learning_rate": 4.2235217673814165e-07, "loss": 0.0698, "step": 38150 }, { "epoch": 4.95906432748538, "grad_norm": 0.6324800848960876, "learning_rate": 4.093567251461989e-07, "loss": 0.0984, "step": 38160 }, { "epoch": 4.9603638726445745, "grad_norm": 0.16622844338417053, "learning_rate": 3.96361273554256e-07, "loss": 0.1042, "step": 38170 }, { "epoch": 4.961663417803768, "grad_norm": 0.8555071353912354, "learning_rate": 3.8336582196231324e-07, "loss": 0.0664, "step": 38180 }, { "epoch": 4.962962962962963, "grad_norm": 0.7191206812858582, "learning_rate": 3.703703703703704e-07, "loss": 0.0824, "step": 38190 }, { "epoch": 4.964262508122157, "grad_norm": 0.7104530930519104, "learning_rate": 3.5737491877842754e-07, "loss": 0.0973, "step": 38200 }, { "epoch": 4.965562053281351, "grad_norm": 0.7049599289894104, "learning_rate": 3.443794671864847e-07, "loss": 0.1149, "step": 38210 }, { "epoch": 4.966861598440546, "grad_norm": 0.5301273465156555, "learning_rate": 3.313840155945419e-07, "loss": 0.1595, "step": 38220 }, { "epoch": 4.96816114359974, "grad_norm": 0.32560163736343384, "learning_rate": 3.1838856400259913e-07, "loss": 0.1307, "step": 38230 }, { "epoch": 4.969460688758934, "grad_norm": 0.37497374415397644, "learning_rate": 3.053931124106563e-07, "loss": 0.0647, "step": 38240 }, { "epoch": 4.970760233918129, "grad_norm": 0.8456316590309143, "learning_rate": 2.9239766081871344e-07, "loss": 0.1233, "step": 38250 }, { "epoch": 4.972059779077323, "grad_norm": 0.15907758474349976, "learning_rate": 2.794022092267706e-07, "loss": 0.0658, "step": 38260 }, { "epoch": 4.973359324236517, "grad_norm": 0.38885951042175293, "learning_rate": 2.6640675763482785e-07, "loss": 0.0667, "step": 38270 }, { "epoch": 4.974658869395712, "grad_norm": 0.2685055136680603, "learning_rate": 2.5341130604288503e-07, "loss": 0.1048, "step": 38280 }, { "epoch": 4.975958414554905, "grad_norm": 0.4749066233634949, "learning_rate": 2.404158544509422e-07, "loss": 0.0732, "step": 38290 }, { "epoch": 4.9772579597141, "grad_norm": 0.7249954342842102, "learning_rate": 2.2742040285899934e-07, "loss": 0.1173, "step": 38300 }, { "epoch": 4.978557504873295, "grad_norm": 0.24232815206050873, "learning_rate": 2.1442495126705654e-07, "loss": 0.1307, "step": 38310 }, { "epoch": 4.979857050032488, "grad_norm": 0.3446873426437378, "learning_rate": 2.0142949967511372e-07, "loss": 0.0726, "step": 38320 }, { "epoch": 4.981156595191683, "grad_norm": 0.6538987755775452, "learning_rate": 1.884340480831709e-07, "loss": 0.0842, "step": 38330 }, { "epoch": 4.982456140350877, "grad_norm": 0.26396477222442627, "learning_rate": 1.7543859649122808e-07, "loss": 0.1178, "step": 38340 }, { "epoch": 4.983755685510071, "grad_norm": 0.8718075156211853, "learning_rate": 1.6244314489928526e-07, "loss": 0.1114, "step": 38350 }, { "epoch": 4.985055230669266, "grad_norm": 0.7346181869506836, "learning_rate": 1.4944769330734244e-07, "loss": 0.0844, "step": 38360 }, { "epoch": 4.98635477582846, "grad_norm": 0.44351983070373535, "learning_rate": 1.3645224171539962e-07, "loss": 0.1462, "step": 38370 }, { "epoch": 4.987654320987654, "grad_norm": 0.18780091404914856, "learning_rate": 1.234567901234568e-07, "loss": 0.0945, "step": 38380 }, { "epoch": 4.988953866146849, "grad_norm": 0.35770750045776367, "learning_rate": 1.1046133853151398e-07, "loss": 0.0853, "step": 38390 }, { "epoch": 4.990253411306043, "grad_norm": 0.5883762836456299, "learning_rate": 9.746588693957116e-08, "loss": 0.0816, "step": 38400 }, { "epoch": 4.991552956465237, "grad_norm": 0.24757328629493713, "learning_rate": 8.447043534762833e-08, "loss": 0.1374, "step": 38410 }, { "epoch": 4.992852501624432, "grad_norm": 0.557776927947998, "learning_rate": 7.147498375568551e-08, "loss": 0.1249, "step": 38420 }, { "epoch": 4.994152046783626, "grad_norm": 0.3505350649356842, "learning_rate": 5.847953216374269e-08, "loss": 0.1191, "step": 38430 }, { "epoch": 4.99545159194282, "grad_norm": 0.4326639473438263, "learning_rate": 4.548408057179987e-08, "loss": 0.1234, "step": 38440 }, { "epoch": 4.996751137102014, "grad_norm": 0.2102600485086441, "learning_rate": 3.248862897985705e-08, "loss": 0.0904, "step": 38450 }, { "epoch": 4.9980506822612085, "grad_norm": 0.39034053683280945, "learning_rate": 1.949317738791423e-08, "loss": 0.1255, "step": 38460 }, { "epoch": 4.999350227420403, "grad_norm": 0.5811758637428284, "learning_rate": 6.4977257959714095e-09, "loss": 0.0812, "step": 38470 }, { "epoch": 5.0, "eval_loss": 0.07974080741405487, "eval_runtime": 852.2757, "eval_samples_per_second": 9.029, "eval_steps_per_second": 9.029, "step": 38475 } ], "logging_steps": 10, "max_steps": 38475, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.7080625635328e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }