{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1650259218267607, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005825129609133803, "grad_norm": 15.917865753173828, "learning_rate": 1.8e-07, "loss": 0.8727, "step": 10 }, { "epoch": 0.0011650259218267606, "grad_norm": 14.31850814819336, "learning_rate": 3.8e-07, "loss": 0.9786, "step": 20 }, { "epoch": 0.001747538882740141, "grad_norm": 11.33942985534668, "learning_rate": 5.8e-07, "loss": 0.9023, "step": 30 }, { "epoch": 0.0023300518436535213, "grad_norm": 12.361573219299316, "learning_rate": 7.8e-07, "loss": 0.9143, "step": 40 }, { "epoch": 0.0029125648045669017, "grad_norm": 9.150689125061035, "learning_rate": 9.8e-07, "loss": 0.6634, "step": 50 }, { "epoch": 0.003495077765480282, "grad_norm": 4.736334800720215, "learning_rate": 1.18e-06, "loss": 0.5445, "step": 60 }, { "epoch": 0.004077590726393662, "grad_norm": 3.376814126968384, "learning_rate": 1.3800000000000001e-06, "loss": 0.3729, "step": 70 }, { "epoch": 0.0046601036873070425, "grad_norm": 1.6669825315475464, "learning_rate": 1.5800000000000003e-06, "loss": 0.3023, "step": 80 }, { "epoch": 0.005242616648220423, "grad_norm": 3.6736834049224854, "learning_rate": 1.7800000000000001e-06, "loss": 0.254, "step": 90 }, { "epoch": 0.005825129609133803, "grad_norm": 2.4123685359954834, "learning_rate": 1.98e-06, "loss": 0.2135, "step": 100 }, { "epoch": 0.006407642570047184, "grad_norm": 2.173173189163208, "learning_rate": 2.1800000000000003e-06, "loss": 0.1837, "step": 110 }, { "epoch": 0.006990155530960564, "grad_norm": 2.9002139568328857, "learning_rate": 2.38e-06, "loss": 0.1643, "step": 120 }, { "epoch": 0.007572668491873944, "grad_norm": 2.0415618419647217, "learning_rate": 2.5800000000000003e-06, "loss": 0.1547, "step": 130 }, { "epoch": 0.008155181452787324, "grad_norm": 1.617000937461853, "learning_rate": 2.78e-06, "loss": 0.139, "step": 140 }, { "epoch": 0.008737694413700705, "grad_norm": 1.8641277551651, "learning_rate": 2.9800000000000003e-06, "loss": 0.1215, "step": 150 }, { "epoch": 0.009320207374614085, "grad_norm": 1.6065384149551392, "learning_rate": 3.1800000000000005e-06, "loss": 0.1092, "step": 160 }, { "epoch": 0.009902720335527465, "grad_norm": 1.5726075172424316, "learning_rate": 3.38e-06, "loss": 0.1056, "step": 170 }, { "epoch": 0.010485233296440846, "grad_norm": 1.7243578433990479, "learning_rate": 3.58e-06, "loss": 0.0941, "step": 180 }, { "epoch": 0.011067746257354226, "grad_norm": 1.8743624687194824, "learning_rate": 3.7800000000000002e-06, "loss": 0.0954, "step": 190 }, { "epoch": 0.011650259218267607, "grad_norm": 1.4854791164398193, "learning_rate": 3.98e-06, "loss": 0.0894, "step": 200 }, { "epoch": 0.012232772179180987, "grad_norm": 1.6770410537719727, "learning_rate": 4.18e-06, "loss": 0.0965, "step": 210 }, { "epoch": 0.012815285140094368, "grad_norm": 1.3139946460723877, "learning_rate": 4.38e-06, "loss": 0.0937, "step": 220 }, { "epoch": 0.013397798101007748, "grad_norm": 1.5635442733764648, "learning_rate": 4.58e-06, "loss": 0.0746, "step": 230 }, { "epoch": 0.013980311061921128, "grad_norm": 2.741807222366333, "learning_rate": 4.780000000000001e-06, "loss": 0.0894, "step": 240 }, { "epoch": 0.014562824022834509, "grad_norm": 2.2682595252990723, "learning_rate": 4.98e-06, "loss": 0.0793, "step": 250 }, { "epoch": 0.015145336983747888, "grad_norm": 1.6816890239715576, "learning_rate": 5.18e-06, "loss": 0.0732, "step": 260 }, { "epoch": 0.015727849944661268, "grad_norm": 1.5313717126846313, "learning_rate": 5.38e-06, "loss": 0.0803, "step": 270 }, { "epoch": 0.01631036290557465, "grad_norm": 1.8341925144195557, "learning_rate": 5.580000000000001e-06, "loss": 0.0723, "step": 280 }, { "epoch": 0.01689287586648803, "grad_norm": 1.5430922508239746, "learning_rate": 5.78e-06, "loss": 0.0849, "step": 290 }, { "epoch": 0.01747538882740141, "grad_norm": 1.8703837394714355, "learning_rate": 5.98e-06, "loss": 0.0762, "step": 300 }, { "epoch": 0.01805790178831479, "grad_norm": 1.8380842208862305, "learning_rate": 6.18e-06, "loss": 0.0776, "step": 310 }, { "epoch": 0.01864041474922817, "grad_norm": 1.6887844800949097, "learning_rate": 6.38e-06, "loss": 0.0706, "step": 320 }, { "epoch": 0.01922292771014155, "grad_norm": 2.3623692989349365, "learning_rate": 6.58e-06, "loss": 0.0797, "step": 330 }, { "epoch": 0.01980544067105493, "grad_norm": 2.1725242137908936, "learning_rate": 6.78e-06, "loss": 0.0807, "step": 340 }, { "epoch": 0.02038795363196831, "grad_norm": 1.3125557899475098, "learning_rate": 6.98e-06, "loss": 0.0628, "step": 350 }, { "epoch": 0.020970466592881692, "grad_norm": 1.1529388427734375, "learning_rate": 7.180000000000001e-06, "loss": 0.0673, "step": 360 }, { "epoch": 0.021552979553795072, "grad_norm": 1.211694598197937, "learning_rate": 7.3800000000000005e-06, "loss": 0.0714, "step": 370 }, { "epoch": 0.022135492514708453, "grad_norm": 1.3804696798324585, "learning_rate": 7.580000000000001e-06, "loss": 0.056, "step": 380 }, { "epoch": 0.022718005475621833, "grad_norm": 1.4936906099319458, "learning_rate": 7.78e-06, "loss": 0.0669, "step": 390 }, { "epoch": 0.023300518436535213, "grad_norm": 1.6949350833892822, "learning_rate": 7.98e-06, "loss": 0.0612, "step": 400 }, { "epoch": 0.023883031397448594, "grad_norm": 1.7241305112838745, "learning_rate": 8.18e-06, "loss": 0.0631, "step": 410 }, { "epoch": 0.024465544358361974, "grad_norm": 1.2981735467910767, "learning_rate": 8.380000000000001e-06, "loss": 0.0595, "step": 420 }, { "epoch": 0.025048057319275355, "grad_norm": 1.3489582538604736, "learning_rate": 8.580000000000001e-06, "loss": 0.0702, "step": 430 }, { "epoch": 0.025630570280188735, "grad_norm": 1.6479697227478027, "learning_rate": 8.78e-06, "loss": 0.068, "step": 440 }, { "epoch": 0.026213083241102116, "grad_norm": 1.8191801309585571, "learning_rate": 8.98e-06, "loss": 0.0611, "step": 450 }, { "epoch": 0.026795596202015496, "grad_norm": 1.2770086526870728, "learning_rate": 9.180000000000002e-06, "loss": 0.0807, "step": 460 }, { "epoch": 0.027378109162928876, "grad_norm": 1.7458429336547852, "learning_rate": 9.38e-06, "loss": 0.0599, "step": 470 }, { "epoch": 0.027960622123842257, "grad_norm": 1.6553988456726074, "learning_rate": 9.58e-06, "loss": 0.0626, "step": 480 }, { "epoch": 0.028543135084755637, "grad_norm": 1.2745176553726196, "learning_rate": 9.78e-06, "loss": 0.0565, "step": 490 }, { "epoch": 0.029125648045669018, "grad_norm": 1.5439543724060059, "learning_rate": 9.980000000000001e-06, "loss": 0.053, "step": 500 }, { "epoch": 0.029708161006582398, "grad_norm": 1.1300392150878906, "learning_rate": 1.018e-05, "loss": 0.0529, "step": 510 }, { "epoch": 0.030290673967495775, "grad_norm": 1.5923973321914673, "learning_rate": 1.038e-05, "loss": 0.0481, "step": 520 }, { "epoch": 0.030873186928409155, "grad_norm": 1.2319856882095337, "learning_rate": 1.058e-05, "loss": 0.0573, "step": 530 }, { "epoch": 0.031455699889322536, "grad_norm": 0.9990113377571106, "learning_rate": 1.0780000000000002e-05, "loss": 0.0509, "step": 540 }, { "epoch": 0.03203821285023592, "grad_norm": 1.4038890600204468, "learning_rate": 1.098e-05, "loss": 0.045, "step": 550 }, { "epoch": 0.0326207258111493, "grad_norm": 0.9704393744468689, "learning_rate": 1.118e-05, "loss": 0.0449, "step": 560 }, { "epoch": 0.03320323877206268, "grad_norm": 1.1305203437805176, "learning_rate": 1.1380000000000001e-05, "loss": 0.0405, "step": 570 }, { "epoch": 0.03378575173297606, "grad_norm": 1.1110806465148926, "learning_rate": 1.1580000000000001e-05, "loss": 0.0431, "step": 580 }, { "epoch": 0.03436826469388944, "grad_norm": 1.1169344186782837, "learning_rate": 1.178e-05, "loss": 0.0503, "step": 590 }, { "epoch": 0.03495077765480282, "grad_norm": 0.9643336534500122, "learning_rate": 1.198e-05, "loss": 0.0453, "step": 600 }, { "epoch": 0.0355332906157162, "grad_norm": 1.1039515733718872, "learning_rate": 1.2180000000000002e-05, "loss": 0.0556, "step": 610 }, { "epoch": 0.03611580357662958, "grad_norm": 1.6178942918777466, "learning_rate": 1.238e-05, "loss": 0.0497, "step": 620 }, { "epoch": 0.03669831653754296, "grad_norm": 1.2969835996627808, "learning_rate": 1.258e-05, "loss": 0.0461, "step": 630 }, { "epoch": 0.03728082949845634, "grad_norm": 0.7697727680206299, "learning_rate": 1.278e-05, "loss": 0.0391, "step": 640 }, { "epoch": 0.037863342459369724, "grad_norm": 3.228864908218384, "learning_rate": 1.2980000000000001e-05, "loss": 0.0545, "step": 650 }, { "epoch": 0.0384458554202831, "grad_norm": 1.3539026975631714, "learning_rate": 1.3180000000000001e-05, "loss": 0.0452, "step": 660 }, { "epoch": 0.039028368381196485, "grad_norm": 0.9900804162025452, "learning_rate": 1.338e-05, "loss": 0.0556, "step": 670 }, { "epoch": 0.03961088134210986, "grad_norm": 1.2056667804718018, "learning_rate": 1.358e-05, "loss": 0.0477, "step": 680 }, { "epoch": 0.040193394303023246, "grad_norm": 0.956168532371521, "learning_rate": 1.3780000000000002e-05, "loss": 0.043, "step": 690 }, { "epoch": 0.04077590726393662, "grad_norm": 1.0954511165618896, "learning_rate": 1.3980000000000002e-05, "loss": 0.0445, "step": 700 }, { "epoch": 0.04135842022485, "grad_norm": 0.995133638381958, "learning_rate": 1.4180000000000001e-05, "loss": 0.0503, "step": 710 }, { "epoch": 0.041940933185763384, "grad_norm": 1.333242416381836, "learning_rate": 1.4380000000000001e-05, "loss": 0.0449, "step": 720 }, { "epoch": 0.04252344614667676, "grad_norm": 1.0642189979553223, "learning_rate": 1.4580000000000003e-05, "loss": 0.0441, "step": 730 }, { "epoch": 0.043105959107590144, "grad_norm": 1.1864749193191528, "learning_rate": 1.4779999999999999e-05, "loss": 0.0369, "step": 740 }, { "epoch": 0.04368847206850352, "grad_norm": 1.6109168529510498, "learning_rate": 1.4979999999999999e-05, "loss": 0.0386, "step": 750 }, { "epoch": 0.044270985029416905, "grad_norm": 1.1281574964523315, "learning_rate": 1.518e-05, "loss": 0.0351, "step": 760 }, { "epoch": 0.04485349799033028, "grad_norm": 0.8592135310173035, "learning_rate": 1.538e-05, "loss": 0.0355, "step": 770 }, { "epoch": 0.045436010951243666, "grad_norm": 0.9938865900039673, "learning_rate": 1.558e-05, "loss": 0.0386, "step": 780 }, { "epoch": 0.04601852391215704, "grad_norm": 1.2555829286575317, "learning_rate": 1.578e-05, "loss": 0.0371, "step": 790 }, { "epoch": 0.04660103687307043, "grad_norm": 1.0531145334243774, "learning_rate": 1.598e-05, "loss": 0.0412, "step": 800 }, { "epoch": 0.047183549833983804, "grad_norm": 0.8126295208930969, "learning_rate": 1.618e-05, "loss": 0.0375, "step": 810 }, { "epoch": 0.04776606279489719, "grad_norm": 1.112064242362976, "learning_rate": 1.6380000000000002e-05, "loss": 0.0408, "step": 820 }, { "epoch": 0.048348575755810565, "grad_norm": 0.8695337772369385, "learning_rate": 1.658e-05, "loss": 0.0391, "step": 830 }, { "epoch": 0.04893108871672395, "grad_norm": 0.8934742212295532, "learning_rate": 1.6780000000000002e-05, "loss": 0.0367, "step": 840 }, { "epoch": 0.049513601677637326, "grad_norm": 0.9549902081489563, "learning_rate": 1.698e-05, "loss": 0.0375, "step": 850 }, { "epoch": 0.05009611463855071, "grad_norm": 1.1114585399627686, "learning_rate": 1.718e-05, "loss": 0.0396, "step": 860 }, { "epoch": 0.050678627599464086, "grad_norm": 0.7776467204093933, "learning_rate": 1.7380000000000003e-05, "loss": 0.0387, "step": 870 }, { "epoch": 0.05126114056037747, "grad_norm": 0.6502721309661865, "learning_rate": 1.758e-05, "loss": 0.0384, "step": 880 }, { "epoch": 0.05184365352129085, "grad_norm": 0.6057956218719482, "learning_rate": 1.7780000000000003e-05, "loss": 0.0335, "step": 890 }, { "epoch": 0.05242616648220423, "grad_norm": 1.1319633722305298, "learning_rate": 1.798e-05, "loss": 0.0411, "step": 900 }, { "epoch": 0.05300867944311761, "grad_norm": 0.984447717666626, "learning_rate": 1.818e-05, "loss": 0.038, "step": 910 }, { "epoch": 0.05359119240403099, "grad_norm": 0.7957633137702942, "learning_rate": 1.838e-05, "loss": 0.0413, "step": 920 }, { "epoch": 0.05417370536494437, "grad_norm": 0.8273193836212158, "learning_rate": 1.858e-05, "loss": 0.0314, "step": 930 }, { "epoch": 0.05475621832585775, "grad_norm": 0.9830345511436462, "learning_rate": 1.878e-05, "loss": 0.0311, "step": 940 }, { "epoch": 0.05533873128677113, "grad_norm": 1.193394660949707, "learning_rate": 1.898e-05, "loss": 0.0418, "step": 950 }, { "epoch": 0.055921244247684514, "grad_norm": 0.9747568368911743, "learning_rate": 1.918e-05, "loss": 0.0402, "step": 960 }, { "epoch": 0.05650375720859789, "grad_norm": 0.7725551128387451, "learning_rate": 1.938e-05, "loss": 0.0407, "step": 970 }, { "epoch": 0.057086270169511275, "grad_norm": 0.8125936985015869, "learning_rate": 1.9580000000000002e-05, "loss": 0.0361, "step": 980 }, { "epoch": 0.05766878313042465, "grad_norm": 1.088542103767395, "learning_rate": 1.978e-05, "loss": 0.037, "step": 990 }, { "epoch": 0.058251296091338035, "grad_norm": 0.812275767326355, "learning_rate": 1.9980000000000002e-05, "loss": 0.0374, "step": 1000 }, { "epoch": 0.05883380905225141, "grad_norm": 0.7162421941757202, "learning_rate": 2.0180000000000003e-05, "loss": 0.0314, "step": 1010 }, { "epoch": 0.059416322013164796, "grad_norm": 0.6886436343193054, "learning_rate": 2.038e-05, "loss": 0.031, "step": 1020 }, { "epoch": 0.05999883497407817, "grad_norm": 0.9985455274581909, "learning_rate": 2.0580000000000003e-05, "loss": 0.0376, "step": 1030 }, { "epoch": 0.06058134793499155, "grad_norm": 0.7475337386131287, "learning_rate": 2.078e-05, "loss": 0.0274, "step": 1040 }, { "epoch": 0.061163860895904934, "grad_norm": 1.288151502609253, "learning_rate": 2.098e-05, "loss": 0.0318, "step": 1050 }, { "epoch": 0.06174637385681831, "grad_norm": 1.1538382768630981, "learning_rate": 2.118e-05, "loss": 0.0346, "step": 1060 }, { "epoch": 0.062328886817731695, "grad_norm": 0.9230291843414307, "learning_rate": 2.138e-05, "loss": 0.0332, "step": 1070 }, { "epoch": 0.06291139977864507, "grad_norm": 0.9989129304885864, "learning_rate": 2.158e-05, "loss": 0.0361, "step": 1080 }, { "epoch": 0.06349391273955846, "grad_norm": 0.8104183673858643, "learning_rate": 2.178e-05, "loss": 0.0229, "step": 1090 }, { "epoch": 0.06407642570047184, "grad_norm": 0.7136157155036926, "learning_rate": 2.198e-05, "loss": 0.0276, "step": 1100 }, { "epoch": 0.06465893866138521, "grad_norm": 1.033891201019287, "learning_rate": 2.218e-05, "loss": 0.0289, "step": 1110 }, { "epoch": 0.0652414516222986, "grad_norm": 1.0036697387695312, "learning_rate": 2.2380000000000003e-05, "loss": 0.0351, "step": 1120 }, { "epoch": 0.06582396458321198, "grad_norm": 0.8960000276565552, "learning_rate": 2.258e-05, "loss": 0.0399, "step": 1130 }, { "epoch": 0.06640647754412536, "grad_norm": 0.8655632734298706, "learning_rate": 2.2780000000000002e-05, "loss": 0.0308, "step": 1140 }, { "epoch": 0.06698899050503873, "grad_norm": 0.8785617351531982, "learning_rate": 2.298e-05, "loss": 0.0318, "step": 1150 }, { "epoch": 0.06757150346595212, "grad_norm": 0.9061183333396912, "learning_rate": 2.318e-05, "loss": 0.0309, "step": 1160 }, { "epoch": 0.0681540164268655, "grad_norm": 0.8667035102844238, "learning_rate": 2.3380000000000003e-05, "loss": 0.0277, "step": 1170 }, { "epoch": 0.06873652938777888, "grad_norm": 0.9552799463272095, "learning_rate": 2.358e-05, "loss": 0.0276, "step": 1180 }, { "epoch": 0.06931904234869225, "grad_norm": 0.8793132305145264, "learning_rate": 2.3780000000000003e-05, "loss": 0.026, "step": 1190 }, { "epoch": 0.06990155530960564, "grad_norm": 0.569671094417572, "learning_rate": 2.398e-05, "loss": 0.0287, "step": 1200 }, { "epoch": 0.07048406827051902, "grad_norm": 0.5862597823143005, "learning_rate": 2.418e-05, "loss": 0.0298, "step": 1210 }, { "epoch": 0.0710665812314324, "grad_norm": 0.6951908469200134, "learning_rate": 2.438e-05, "loss": 0.0276, "step": 1220 }, { "epoch": 0.07164909419234577, "grad_norm": 0.6039175987243652, "learning_rate": 2.4580000000000002e-05, "loss": 0.0274, "step": 1230 }, { "epoch": 0.07223160715325916, "grad_norm": 0.5822569727897644, "learning_rate": 2.478e-05, "loss": 0.0238, "step": 1240 }, { "epoch": 0.07281412011417254, "grad_norm": 0.8416801691055298, "learning_rate": 2.498e-05, "loss": 0.0306, "step": 1250 }, { "epoch": 0.07339663307508593, "grad_norm": 0.7263565063476562, "learning_rate": 2.5180000000000003e-05, "loss": 0.0244, "step": 1260 }, { "epoch": 0.0739791460359993, "grad_norm": 0.9955845475196838, "learning_rate": 2.5380000000000004e-05, "loss": 0.0265, "step": 1270 }, { "epoch": 0.07456165899691268, "grad_norm": 0.9630759358406067, "learning_rate": 2.5580000000000002e-05, "loss": 0.0266, "step": 1280 }, { "epoch": 0.07514417195782606, "grad_norm": 0.6545844674110413, "learning_rate": 2.5779999999999997e-05, "loss": 0.0293, "step": 1290 }, { "epoch": 0.07572668491873945, "grad_norm": 0.5347998142242432, "learning_rate": 2.598e-05, "loss": 0.0261, "step": 1300 }, { "epoch": 0.07630919787965282, "grad_norm": 0.6423347592353821, "learning_rate": 2.618e-05, "loss": 0.027, "step": 1310 }, { "epoch": 0.0768917108405662, "grad_norm": 0.9828246831893921, "learning_rate": 2.6379999999999998e-05, "loss": 0.0322, "step": 1320 }, { "epoch": 0.07747422380147959, "grad_norm": 0.9521007537841797, "learning_rate": 2.658e-05, "loss": 0.0289, "step": 1330 }, { "epoch": 0.07805673676239297, "grad_norm": 0.9038381576538086, "learning_rate": 2.678e-05, "loss": 0.0262, "step": 1340 }, { "epoch": 0.07863924972330634, "grad_norm": 0.6776038408279419, "learning_rate": 2.698e-05, "loss": 0.0278, "step": 1350 }, { "epoch": 0.07922176268421972, "grad_norm": 1.004285454750061, "learning_rate": 2.718e-05, "loss": 0.0258, "step": 1360 }, { "epoch": 0.07980427564513311, "grad_norm": 0.5752717852592468, "learning_rate": 2.738e-05, "loss": 0.0219, "step": 1370 }, { "epoch": 0.08038678860604649, "grad_norm": 0.6365467309951782, "learning_rate": 2.758e-05, "loss": 0.025, "step": 1380 }, { "epoch": 0.08096930156695986, "grad_norm": 0.5218271613121033, "learning_rate": 2.778e-05, "loss": 0.0216, "step": 1390 }, { "epoch": 0.08155181452787325, "grad_norm": 0.8556839227676392, "learning_rate": 2.798e-05, "loss": 0.0281, "step": 1400 }, { "epoch": 0.08213432748878663, "grad_norm": 0.7402660250663757, "learning_rate": 2.818e-05, "loss": 0.0218, "step": 1410 }, { "epoch": 0.0827168404497, "grad_norm": 0.5787367820739746, "learning_rate": 2.8380000000000003e-05, "loss": 0.0302, "step": 1420 }, { "epoch": 0.08329935341061338, "grad_norm": 0.7988361120223999, "learning_rate": 2.858e-05, "loss": 0.0224, "step": 1430 }, { "epoch": 0.08388186637152677, "grad_norm": 0.7526723742485046, "learning_rate": 2.8780000000000002e-05, "loss": 0.024, "step": 1440 }, { "epoch": 0.08446437933244015, "grad_norm": 0.6987826228141785, "learning_rate": 2.898e-05, "loss": 0.0268, "step": 1450 }, { "epoch": 0.08504689229335352, "grad_norm": 0.5740104913711548, "learning_rate": 2.9180000000000002e-05, "loss": 0.0226, "step": 1460 }, { "epoch": 0.0856294052542669, "grad_norm": 0.7381904125213623, "learning_rate": 2.9380000000000003e-05, "loss": 0.0268, "step": 1470 }, { "epoch": 0.08621191821518029, "grad_norm": 0.6562973260879517, "learning_rate": 2.958e-05, "loss": 0.0294, "step": 1480 }, { "epoch": 0.08679443117609367, "grad_norm": 0.7674175500869751, "learning_rate": 2.9780000000000003e-05, "loss": 0.0236, "step": 1490 }, { "epoch": 0.08737694413700704, "grad_norm": 0.7125323414802551, "learning_rate": 2.998e-05, "loss": 0.0298, "step": 1500 }, { "epoch": 0.08795945709792043, "grad_norm": 0.6497645378112793, "learning_rate": 3.0180000000000002e-05, "loss": 0.0225, "step": 1510 }, { "epoch": 0.08854197005883381, "grad_norm": 0.7530396580696106, "learning_rate": 3.0380000000000004e-05, "loss": 0.0239, "step": 1520 }, { "epoch": 0.0891244830197472, "grad_norm": 0.5879446268081665, "learning_rate": 3.058e-05, "loss": 0.0249, "step": 1530 }, { "epoch": 0.08970699598066056, "grad_norm": 0.694493293762207, "learning_rate": 3.078e-05, "loss": 0.0253, "step": 1540 }, { "epoch": 0.09028950894157395, "grad_norm": 0.3544371426105499, "learning_rate": 3.0980000000000005e-05, "loss": 0.0248, "step": 1550 }, { "epoch": 0.09087202190248733, "grad_norm": 0.576510488986969, "learning_rate": 3.118e-05, "loss": 0.0233, "step": 1560 }, { "epoch": 0.09145453486340072, "grad_norm": 0.6982611417770386, "learning_rate": 3.138e-05, "loss": 0.021, "step": 1570 }, { "epoch": 0.09203704782431409, "grad_norm": 0.451580673456192, "learning_rate": 3.1580000000000006e-05, "loss": 0.0232, "step": 1580 }, { "epoch": 0.09261956078522747, "grad_norm": 0.5893693566322327, "learning_rate": 3.1780000000000004e-05, "loss": 0.0212, "step": 1590 }, { "epoch": 0.09320207374614085, "grad_norm": 0.7551915645599365, "learning_rate": 3.198e-05, "loss": 0.0213, "step": 1600 }, { "epoch": 0.09378458670705424, "grad_norm": 0.6255674362182617, "learning_rate": 3.218e-05, "loss": 0.0184, "step": 1610 }, { "epoch": 0.09436709966796761, "grad_norm": 0.49846622347831726, "learning_rate": 3.238e-05, "loss": 0.0171, "step": 1620 }, { "epoch": 0.09494961262888099, "grad_norm": 0.5102967023849487, "learning_rate": 3.2579999999999996e-05, "loss": 0.0219, "step": 1630 }, { "epoch": 0.09553212558979438, "grad_norm": 0.8417807817459106, "learning_rate": 3.278e-05, "loss": 0.0218, "step": 1640 }, { "epoch": 0.09611463855070776, "grad_norm": 0.6185486912727356, "learning_rate": 3.298e-05, "loss": 0.0229, "step": 1650 }, { "epoch": 0.09669715151162113, "grad_norm": 0.46000969409942627, "learning_rate": 3.318e-05, "loss": 0.0179, "step": 1660 }, { "epoch": 0.09727966447253451, "grad_norm": 0.6028854250907898, "learning_rate": 3.338e-05, "loss": 0.0243, "step": 1670 }, { "epoch": 0.0978621774334479, "grad_norm": 0.5447357892990112, "learning_rate": 3.358e-05, "loss": 0.0241, "step": 1680 }, { "epoch": 0.09844469039436128, "grad_norm": 0.7885206937789917, "learning_rate": 3.378e-05, "loss": 0.0266, "step": 1690 }, { "epoch": 0.09902720335527465, "grad_norm": 0.8169668912887573, "learning_rate": 3.398e-05, "loss": 0.0257, "step": 1700 }, { "epoch": 0.09960971631618804, "grad_norm": 0.48299190402030945, "learning_rate": 3.418e-05, "loss": 0.0189, "step": 1710 }, { "epoch": 0.10019222927710142, "grad_norm": 0.6562913656234741, "learning_rate": 3.438e-05, "loss": 0.023, "step": 1720 }, { "epoch": 0.10077474223801479, "grad_norm": 0.47423088550567627, "learning_rate": 3.4580000000000004e-05, "loss": 0.0218, "step": 1730 }, { "epoch": 0.10135725519892817, "grad_norm": 0.5578410029411316, "learning_rate": 3.478e-05, "loss": 0.0201, "step": 1740 }, { "epoch": 0.10193976815984156, "grad_norm": 0.47844117879867554, "learning_rate": 3.498e-05, "loss": 0.0235, "step": 1750 }, { "epoch": 0.10252228112075494, "grad_norm": 0.5934709310531616, "learning_rate": 3.518e-05, "loss": 0.0223, "step": 1760 }, { "epoch": 0.10310479408166831, "grad_norm": 0.6920827031135559, "learning_rate": 3.5380000000000003e-05, "loss": 0.0201, "step": 1770 }, { "epoch": 0.1036873070425817, "grad_norm": 0.6225170493125916, "learning_rate": 3.558e-05, "loss": 0.0223, "step": 1780 }, { "epoch": 0.10426982000349508, "grad_norm": 0.6078155636787415, "learning_rate": 3.578e-05, "loss": 0.027, "step": 1790 }, { "epoch": 0.10485233296440846, "grad_norm": 0.577744722366333, "learning_rate": 3.5980000000000004e-05, "loss": 0.021, "step": 1800 }, { "epoch": 0.10543484592532183, "grad_norm": 0.8347064256668091, "learning_rate": 3.618e-05, "loss": 0.026, "step": 1810 }, { "epoch": 0.10601735888623522, "grad_norm": 0.7143003344535828, "learning_rate": 3.638e-05, "loss": 0.0254, "step": 1820 }, { "epoch": 0.1065998718471486, "grad_norm": 0.5803647637367249, "learning_rate": 3.6580000000000006e-05, "loss": 0.021, "step": 1830 }, { "epoch": 0.10718238480806198, "grad_norm": 0.5518302321434021, "learning_rate": 3.6780000000000004e-05, "loss": 0.0219, "step": 1840 }, { "epoch": 0.10776489776897535, "grad_norm": 0.6853287220001221, "learning_rate": 3.698e-05, "loss": 0.0187, "step": 1850 }, { "epoch": 0.10834741072988874, "grad_norm": 0.6786518096923828, "learning_rate": 3.7180000000000007e-05, "loss": 0.0228, "step": 1860 }, { "epoch": 0.10892992369080212, "grad_norm": 0.6265080571174622, "learning_rate": 3.7380000000000005e-05, "loss": 0.0236, "step": 1870 }, { "epoch": 0.1095124366517155, "grad_norm": 0.5566945672035217, "learning_rate": 3.758e-05, "loss": 0.0176, "step": 1880 }, { "epoch": 0.11009494961262888, "grad_norm": 0.5942752957344055, "learning_rate": 3.778000000000001e-05, "loss": 0.018, "step": 1890 }, { "epoch": 0.11067746257354226, "grad_norm": 0.50779789686203, "learning_rate": 3.7980000000000006e-05, "loss": 0.0233, "step": 1900 }, { "epoch": 0.11125997553445564, "grad_norm": 1.0494762659072876, "learning_rate": 3.818e-05, "loss": 0.0225, "step": 1910 }, { "epoch": 0.11184248849536903, "grad_norm": 0.6743618249893188, "learning_rate": 3.838e-05, "loss": 0.0216, "step": 1920 }, { "epoch": 0.1124250014562824, "grad_norm": 0.6243472099304199, "learning_rate": 3.858e-05, "loss": 0.0213, "step": 1930 }, { "epoch": 0.11300751441719578, "grad_norm": 0.5638609528541565, "learning_rate": 3.878e-05, "loss": 0.0183, "step": 1940 }, { "epoch": 0.11359002737810917, "grad_norm": 0.5323771238327026, "learning_rate": 3.898e-05, "loss": 0.0206, "step": 1950 }, { "epoch": 0.11417254033902255, "grad_norm": 0.6081473231315613, "learning_rate": 3.918e-05, "loss": 0.0158, "step": 1960 }, { "epoch": 0.11475505329993592, "grad_norm": 0.5085746049880981, "learning_rate": 3.938e-05, "loss": 0.0213, "step": 1970 }, { "epoch": 0.1153375662608493, "grad_norm": 0.42805761098861694, "learning_rate": 3.958e-05, "loss": 0.0232, "step": 1980 }, { "epoch": 0.11592007922176269, "grad_norm": 0.5383941531181335, "learning_rate": 3.978e-05, "loss": 0.0177, "step": 1990 }, { "epoch": 0.11650259218267607, "grad_norm": 0.3585794270038605, "learning_rate": 3.998e-05, "loss": 0.0166, "step": 2000 }, { "epoch": 0.11708510514358944, "grad_norm": 1.158002495765686, "learning_rate": 4.018e-05, "loss": 0.0168, "step": 2010 }, { "epoch": 0.11766761810450282, "grad_norm": 2.0702521800994873, "learning_rate": 4.038e-05, "loss": 0.0199, "step": 2020 }, { "epoch": 0.11825013106541621, "grad_norm": 0.4061110019683838, "learning_rate": 4.058e-05, "loss": 0.0195, "step": 2030 }, { "epoch": 0.11883264402632959, "grad_norm": 0.37811076641082764, "learning_rate": 4.078e-05, "loss": 0.0214, "step": 2040 }, { "epoch": 0.11941515698724296, "grad_norm": 0.40194132924079895, "learning_rate": 4.0980000000000004e-05, "loss": 0.0168, "step": 2050 }, { "epoch": 0.11999766994815635, "grad_norm": 0.42309334874153137, "learning_rate": 4.118e-05, "loss": 0.0163, "step": 2060 }, { "epoch": 0.12058018290906973, "grad_norm": 0.5154535174369812, "learning_rate": 4.138e-05, "loss": 0.0164, "step": 2070 }, { "epoch": 0.1211626958699831, "grad_norm": 0.3741661608219147, "learning_rate": 4.1580000000000005e-05, "loss": 0.0157, "step": 2080 }, { "epoch": 0.12174520883089648, "grad_norm": 0.45625871419906616, "learning_rate": 4.178e-05, "loss": 0.0168, "step": 2090 }, { "epoch": 0.12232772179180987, "grad_norm": 0.46603503823280334, "learning_rate": 4.198e-05, "loss": 0.0161, "step": 2100 }, { "epoch": 0.12291023475272325, "grad_norm": 0.5300186276435852, "learning_rate": 4.2180000000000006e-05, "loss": 0.0158, "step": 2110 }, { "epoch": 0.12349274771363662, "grad_norm": 0.3945985436439514, "learning_rate": 4.2380000000000004e-05, "loss": 0.0162, "step": 2120 }, { "epoch": 0.12407526067455, "grad_norm": 0.3781861662864685, "learning_rate": 4.258e-05, "loss": 0.0169, "step": 2130 }, { "epoch": 0.12465777363546339, "grad_norm": 0.5979866981506348, "learning_rate": 4.278e-05, "loss": 0.017, "step": 2140 }, { "epoch": 0.12524028659637676, "grad_norm": 1.1459311246871948, "learning_rate": 4.2980000000000005e-05, "loss": 0.0201, "step": 2150 }, { "epoch": 0.12582279955729014, "grad_norm": 0.60248202085495, "learning_rate": 4.318e-05, "loss": 0.0168, "step": 2160 }, { "epoch": 0.12640531251820353, "grad_norm": 0.7197863459587097, "learning_rate": 4.338e-05, "loss": 0.0168, "step": 2170 }, { "epoch": 0.1269878254791169, "grad_norm": 0.5988439917564392, "learning_rate": 4.3580000000000006e-05, "loss": 0.0191, "step": 2180 }, { "epoch": 0.1275703384400303, "grad_norm": 0.6154835820198059, "learning_rate": 4.3780000000000004e-05, "loss": 0.0146, "step": 2190 }, { "epoch": 0.12815285140094368, "grad_norm": 0.5561240911483765, "learning_rate": 4.398e-05, "loss": 0.0178, "step": 2200 }, { "epoch": 0.12873536436185706, "grad_norm": 0.5693632960319519, "learning_rate": 4.418000000000001e-05, "loss": 0.0149, "step": 2210 }, { "epoch": 0.12931787732277042, "grad_norm": 0.3661186099052429, "learning_rate": 4.438e-05, "loss": 0.0153, "step": 2220 }, { "epoch": 0.1299003902836838, "grad_norm": 0.4431229829788208, "learning_rate": 4.458e-05, "loss": 0.0188, "step": 2230 }, { "epoch": 0.1304829032445972, "grad_norm": 0.5412517189979553, "learning_rate": 4.478e-05, "loss": 0.0143, "step": 2240 }, { "epoch": 0.13106541620551057, "grad_norm": 0.41985756158828735, "learning_rate": 4.498e-05, "loss": 0.0189, "step": 2250 }, { "epoch": 0.13164792916642395, "grad_norm": 0.5117793083190918, "learning_rate": 4.518e-05, "loss": 0.0169, "step": 2260 }, { "epoch": 0.13223044212733734, "grad_norm": 0.4169626235961914, "learning_rate": 4.538e-05, "loss": 0.0153, "step": 2270 }, { "epoch": 0.13281295508825072, "grad_norm": 0.561192512512207, "learning_rate": 4.558e-05, "loss": 0.019, "step": 2280 }, { "epoch": 0.1333954680491641, "grad_norm": 0.6099644899368286, "learning_rate": 4.578e-05, "loss": 0.0152, "step": 2290 }, { "epoch": 0.13397798101007746, "grad_norm": 0.363286554813385, "learning_rate": 4.5980000000000004e-05, "loss": 0.0131, "step": 2300 }, { "epoch": 0.13456049397099085, "grad_norm": 0.4099963903427124, "learning_rate": 4.618e-05, "loss": 0.0201, "step": 2310 }, { "epoch": 0.13514300693190423, "grad_norm": 0.49568426609039307, "learning_rate": 4.638e-05, "loss": 0.012, "step": 2320 }, { "epoch": 0.13572551989281761, "grad_norm": 0.5227089524269104, "learning_rate": 4.6580000000000005e-05, "loss": 0.0221, "step": 2330 }, { "epoch": 0.136308032853731, "grad_norm": 0.635858952999115, "learning_rate": 4.678e-05, "loss": 0.0158, "step": 2340 }, { "epoch": 0.13689054581464438, "grad_norm": 0.4881513714790344, "learning_rate": 4.698e-05, "loss": 0.02, "step": 2350 }, { "epoch": 0.13747305877555777, "grad_norm": 0.561577320098877, "learning_rate": 4.718e-05, "loss": 0.0155, "step": 2360 }, { "epoch": 0.13805557173647115, "grad_norm": 0.4685761332511902, "learning_rate": 4.7380000000000004e-05, "loss": 0.0153, "step": 2370 }, { "epoch": 0.1386380846973845, "grad_norm": 0.6399999856948853, "learning_rate": 4.758e-05, "loss": 0.014, "step": 2380 }, { "epoch": 0.1392205976582979, "grad_norm": 0.3637852966785431, "learning_rate": 4.778e-05, "loss": 0.0185, "step": 2390 }, { "epoch": 0.13980311061921127, "grad_norm": 0.8798788189888, "learning_rate": 4.7980000000000005e-05, "loss": 0.0176, "step": 2400 }, { "epoch": 0.14038562358012466, "grad_norm": 0.302896112203598, "learning_rate": 4.818e-05, "loss": 0.0132, "step": 2410 }, { "epoch": 0.14096813654103804, "grad_norm": 0.42611637711524963, "learning_rate": 4.838e-05, "loss": 0.0142, "step": 2420 }, { "epoch": 0.14155064950195143, "grad_norm": 0.4786374866962433, "learning_rate": 4.8580000000000006e-05, "loss": 0.0125, "step": 2430 }, { "epoch": 0.1421331624628648, "grad_norm": 0.38848158717155457, "learning_rate": 4.8780000000000004e-05, "loss": 0.0157, "step": 2440 }, { "epoch": 0.14271567542377817, "grad_norm": 0.4184205234050751, "learning_rate": 4.898e-05, "loss": 0.0146, "step": 2450 }, { "epoch": 0.14329818838469155, "grad_norm": 0.4882209599018097, "learning_rate": 4.918000000000001e-05, "loss": 0.0174, "step": 2460 }, { "epoch": 0.14388070134560493, "grad_norm": 0.5503228306770325, "learning_rate": 4.9380000000000005e-05, "loss": 0.0146, "step": 2470 }, { "epoch": 0.14446321430651832, "grad_norm": 0.46818649768829346, "learning_rate": 4.958e-05, "loss": 0.0152, "step": 2480 }, { "epoch": 0.1450457272674317, "grad_norm": 0.38933342695236206, "learning_rate": 4.978e-05, "loss": 0.0129, "step": 2490 }, { "epoch": 0.14562824022834508, "grad_norm": 0.3939889371395111, "learning_rate": 4.9980000000000006e-05, "loss": 0.0154, "step": 2500 }, { "epoch": 0.14621075318925847, "grad_norm": 0.4957740902900696, "learning_rate": 5.0180000000000004e-05, "loss": 0.0157, "step": 2510 }, { "epoch": 0.14679326615017185, "grad_norm": 0.38307711482048035, "learning_rate": 5.038e-05, "loss": 0.0145, "step": 2520 }, { "epoch": 0.1473757791110852, "grad_norm": 0.45120474696159363, "learning_rate": 5.058000000000001e-05, "loss": 0.0134, "step": 2530 }, { "epoch": 0.1479582920719986, "grad_norm": 0.36444947123527527, "learning_rate": 5.0780000000000005e-05, "loss": 0.017, "step": 2540 }, { "epoch": 0.14854080503291198, "grad_norm": 0.6094910502433777, "learning_rate": 5.098e-05, "loss": 0.0177, "step": 2550 }, { "epoch": 0.14912331799382536, "grad_norm": 0.48176199197769165, "learning_rate": 5.118000000000001e-05, "loss": 0.019, "step": 2560 }, { "epoch": 0.14970583095473874, "grad_norm": 0.6019849181175232, "learning_rate": 5.1380000000000006e-05, "loss": 0.0168, "step": 2570 }, { "epoch": 0.15028834391565213, "grad_norm": 0.8631599545478821, "learning_rate": 5.1580000000000004e-05, "loss": 0.0194, "step": 2580 }, { "epoch": 0.1508708568765655, "grad_norm": 0.6437668204307556, "learning_rate": 5.178000000000001e-05, "loss": 0.0213, "step": 2590 }, { "epoch": 0.1514533698374789, "grad_norm": 0.4733867943286896, "learning_rate": 5.198000000000001e-05, "loss": 0.018, "step": 2600 }, { "epoch": 0.15203588279839225, "grad_norm": 0.6741190552711487, "learning_rate": 5.2180000000000005e-05, "loss": 0.0157, "step": 2610 }, { "epoch": 0.15261839575930564, "grad_norm": 0.6016038656234741, "learning_rate": 5.238000000000001e-05, "loss": 0.017, "step": 2620 }, { "epoch": 0.15320090872021902, "grad_norm": 0.5369425415992737, "learning_rate": 5.258000000000001e-05, "loss": 0.0161, "step": 2630 }, { "epoch": 0.1537834216811324, "grad_norm": 0.37076833844184875, "learning_rate": 5.2780000000000006e-05, "loss": 0.0138, "step": 2640 }, { "epoch": 0.1543659346420458, "grad_norm": 0.5318004488945007, "learning_rate": 5.2980000000000004e-05, "loss": 0.0191, "step": 2650 }, { "epoch": 0.15494844760295917, "grad_norm": 0.4330885410308838, "learning_rate": 5.318000000000001e-05, "loss": 0.0188, "step": 2660 }, { "epoch": 0.15553096056387256, "grad_norm": 0.3233807384967804, "learning_rate": 5.338000000000001e-05, "loss": 0.0165, "step": 2670 }, { "epoch": 0.15611347352478594, "grad_norm": 0.4845235049724579, "learning_rate": 5.3580000000000005e-05, "loss": 0.0167, "step": 2680 }, { "epoch": 0.1566959864856993, "grad_norm": 0.6443028450012207, "learning_rate": 5.378e-05, "loss": 0.0158, "step": 2690 }, { "epoch": 0.15727849944661268, "grad_norm": 0.47870922088623047, "learning_rate": 5.3979999999999995e-05, "loss": 0.0193, "step": 2700 }, { "epoch": 0.15786101240752606, "grad_norm": 0.2681027352809906, "learning_rate": 5.418e-05, "loss": 0.0155, "step": 2710 }, { "epoch": 0.15844352536843945, "grad_norm": 0.7757396697998047, "learning_rate": 5.438e-05, "loss": 0.0159, "step": 2720 }, { "epoch": 0.15902603832935283, "grad_norm": 0.6306728720664978, "learning_rate": 5.4579999999999996e-05, "loss": 0.0184, "step": 2730 }, { "epoch": 0.15960855129026622, "grad_norm": 0.38604074716567993, "learning_rate": 5.478e-05, "loss": 0.0171, "step": 2740 }, { "epoch": 0.1601910642511796, "grad_norm": 0.3736885190010071, "learning_rate": 5.498e-05, "loss": 0.0135, "step": 2750 }, { "epoch": 0.16077357721209298, "grad_norm": 0.5663514733314514, "learning_rate": 5.518e-05, "loss": 0.0152, "step": 2760 }, { "epoch": 0.16135609017300634, "grad_norm": 0.7611909508705139, "learning_rate": 5.538e-05, "loss": 0.0163, "step": 2770 }, { "epoch": 0.16193860313391972, "grad_norm": 0.5880131721496582, "learning_rate": 5.558e-05, "loss": 0.016, "step": 2780 }, { "epoch": 0.1625211160948331, "grad_norm": 0.45431140065193176, "learning_rate": 5.578e-05, "loss": 0.0145, "step": 2790 }, { "epoch": 0.1631036290557465, "grad_norm": 0.4545784890651703, "learning_rate": 5.5979999999999996e-05, "loss": 0.0171, "step": 2800 }, { "epoch": 0.16368614201665987, "grad_norm": 0.4778057932853699, "learning_rate": 5.618e-05, "loss": 0.016, "step": 2810 }, { "epoch": 0.16426865497757326, "grad_norm": 0.7100448608398438, "learning_rate": 5.638e-05, "loss": 0.0149, "step": 2820 }, { "epoch": 0.16485116793848664, "grad_norm": 0.29663315415382385, "learning_rate": 5.658e-05, "loss": 0.0151, "step": 2830 }, { "epoch": 0.1654336808994, "grad_norm": 0.28271105885505676, "learning_rate": 5.678e-05, "loss": 0.015, "step": 2840 }, { "epoch": 0.16601619386031338, "grad_norm": 0.43867939710617065, "learning_rate": 5.698e-05, "loss": 0.0135, "step": 2850 }, { "epoch": 0.16659870682122677, "grad_norm": 0.4974268972873688, "learning_rate": 5.718e-05, "loss": 0.0139, "step": 2860 }, { "epoch": 0.16718121978214015, "grad_norm": 0.41363680362701416, "learning_rate": 5.738e-05, "loss": 0.0132, "step": 2870 }, { "epoch": 0.16776373274305353, "grad_norm": 0.3826027810573578, "learning_rate": 5.758e-05, "loss": 0.0165, "step": 2880 }, { "epoch": 0.16834624570396692, "grad_norm": 0.5121865272521973, "learning_rate": 5.778e-05, "loss": 0.0142, "step": 2890 }, { "epoch": 0.1689287586648803, "grad_norm": 0.463712215423584, "learning_rate": 5.7980000000000004e-05, "loss": 0.0154, "step": 2900 }, { "epoch": 0.16951127162579369, "grad_norm": 0.4119332432746887, "learning_rate": 5.818e-05, "loss": 0.0186, "step": 2910 }, { "epoch": 0.17009378458670704, "grad_norm": 0.48545023798942566, "learning_rate": 5.838e-05, "loss": 0.0139, "step": 2920 }, { "epoch": 0.17067629754762043, "grad_norm": 0.3865881562232971, "learning_rate": 5.858e-05, "loss": 0.0133, "step": 2930 }, { "epoch": 0.1712588105085338, "grad_norm": 0.4394161105155945, "learning_rate": 5.878e-05, "loss": 0.0139, "step": 2940 }, { "epoch": 0.1718413234694472, "grad_norm": 0.3312171399593353, "learning_rate": 5.898e-05, "loss": 0.0129, "step": 2950 }, { "epoch": 0.17242383643036058, "grad_norm": 0.44797638058662415, "learning_rate": 5.918e-05, "loss": 0.0168, "step": 2960 }, { "epoch": 0.17300634939127396, "grad_norm": 0.4827950596809387, "learning_rate": 5.9380000000000004e-05, "loss": 0.0145, "step": 2970 }, { "epoch": 0.17358886235218735, "grad_norm": 0.4608073830604553, "learning_rate": 5.958e-05, "loss": 0.0124, "step": 2980 }, { "epoch": 0.17417137531310073, "grad_norm": 0.52156662940979, "learning_rate": 5.978e-05, "loss": 0.0118, "step": 2990 }, { "epoch": 0.17475388827401409, "grad_norm": 0.2682117819786072, "learning_rate": 5.9980000000000005e-05, "loss": 0.0151, "step": 3000 }, { "epoch": 0.17533640123492747, "grad_norm": 0.5177821516990662, "learning_rate": 6.018e-05, "loss": 0.0151, "step": 3010 }, { "epoch": 0.17591891419584085, "grad_norm": 0.49850231409072876, "learning_rate": 6.038e-05, "loss": 0.0177, "step": 3020 }, { "epoch": 0.17650142715675424, "grad_norm": 0.8092730641365051, "learning_rate": 6.0580000000000006e-05, "loss": 0.0166, "step": 3030 }, { "epoch": 0.17708394011766762, "grad_norm": 0.5744795203208923, "learning_rate": 6.0780000000000004e-05, "loss": 0.0145, "step": 3040 }, { "epoch": 0.177666453078581, "grad_norm": 0.45151400566101074, "learning_rate": 6.098e-05, "loss": 0.016, "step": 3050 }, { "epoch": 0.1782489660394944, "grad_norm": 0.2854032516479492, "learning_rate": 6.118000000000001e-05, "loss": 0.0156, "step": 3060 }, { "epoch": 0.17883147900040777, "grad_norm": 0.4447406828403473, "learning_rate": 6.138e-05, "loss": 0.0178, "step": 3070 }, { "epoch": 0.17941399196132113, "grad_norm": 0.5136816501617432, "learning_rate": 6.158e-05, "loss": 0.0163, "step": 3080 }, { "epoch": 0.1799965049222345, "grad_norm": 0.33760887384414673, "learning_rate": 6.178000000000001e-05, "loss": 0.0163, "step": 3090 }, { "epoch": 0.1805790178831479, "grad_norm": 0.4714753329753876, "learning_rate": 6.198e-05, "loss": 0.0156, "step": 3100 }, { "epoch": 0.18116153084406128, "grad_norm": 0.6138328313827515, "learning_rate": 6.218e-05, "loss": 0.0177, "step": 3110 }, { "epoch": 0.18174404380497466, "grad_norm": 0.7193767428398132, "learning_rate": 6.238000000000001e-05, "loss": 0.0157, "step": 3120 }, { "epoch": 0.18232655676588805, "grad_norm": 0.648130476474762, "learning_rate": 6.258e-05, "loss": 0.0192, "step": 3130 }, { "epoch": 0.18290906972680143, "grad_norm": 0.6144332885742188, "learning_rate": 6.278e-05, "loss": 0.0165, "step": 3140 }, { "epoch": 0.1834915826877148, "grad_norm": 0.5404634475708008, "learning_rate": 6.298000000000001e-05, "loss": 0.0165, "step": 3150 }, { "epoch": 0.18407409564862817, "grad_norm": 0.45073145627975464, "learning_rate": 6.318e-05, "loss": 0.0144, "step": 3160 }, { "epoch": 0.18465660860954156, "grad_norm": 0.3677403926849365, "learning_rate": 6.338e-05, "loss": 0.0147, "step": 3170 }, { "epoch": 0.18523912157045494, "grad_norm": 0.6296012997627258, "learning_rate": 6.358000000000001e-05, "loss": 0.0167, "step": 3180 }, { "epoch": 0.18582163453136832, "grad_norm": 0.5576950311660767, "learning_rate": 6.378e-05, "loss": 0.0184, "step": 3190 }, { "epoch": 0.1864041474922817, "grad_norm": 0.7789751887321472, "learning_rate": 6.398000000000001e-05, "loss": 0.0205, "step": 3200 }, { "epoch": 0.1869866604531951, "grad_norm": 0.42077621817588806, "learning_rate": 6.418000000000001e-05, "loss": 0.016, "step": 3210 }, { "epoch": 0.18756917341410848, "grad_norm": 0.5206339955329895, "learning_rate": 6.438e-05, "loss": 0.0148, "step": 3220 }, { "epoch": 0.18815168637502183, "grad_norm": 0.5122306942939758, "learning_rate": 6.458000000000001e-05, "loss": 0.0147, "step": 3230 }, { "epoch": 0.18873419933593522, "grad_norm": 0.4095894694328308, "learning_rate": 6.478000000000001e-05, "loss": 0.0114, "step": 3240 }, { "epoch": 0.1893167122968486, "grad_norm": 0.3725811839103699, "learning_rate": 6.498e-05, "loss": 0.0118, "step": 3250 }, { "epoch": 0.18989922525776198, "grad_norm": 0.3092801868915558, "learning_rate": 6.518000000000001e-05, "loss": 0.0192, "step": 3260 }, { "epoch": 0.19048173821867537, "grad_norm": 0.39955639839172363, "learning_rate": 6.538000000000001e-05, "loss": 0.0148, "step": 3270 }, { "epoch": 0.19106425117958875, "grad_norm": 0.4038904309272766, "learning_rate": 6.558e-05, "loss": 0.0118, "step": 3280 }, { "epoch": 0.19164676414050213, "grad_norm": 0.512697696685791, "learning_rate": 6.578000000000001e-05, "loss": 0.0157, "step": 3290 }, { "epoch": 0.19222927710141552, "grad_norm": 0.5787442326545715, "learning_rate": 6.598e-05, "loss": 0.014, "step": 3300 }, { "epoch": 0.19281179006232887, "grad_norm": 0.45893457531929016, "learning_rate": 6.618e-05, "loss": 0.0145, "step": 3310 }, { "epoch": 0.19339430302324226, "grad_norm": 0.36508709192276, "learning_rate": 6.638e-05, "loss": 0.0133, "step": 3320 }, { "epoch": 0.19397681598415564, "grad_norm": 0.6395530700683594, "learning_rate": 6.658e-05, "loss": 0.0148, "step": 3330 }, { "epoch": 0.19455932894506903, "grad_norm": 0.39440450072288513, "learning_rate": 6.678e-05, "loss": 0.014, "step": 3340 }, { "epoch": 0.1951418419059824, "grad_norm": 0.4657351076602936, "learning_rate": 6.698e-05, "loss": 0.0162, "step": 3350 }, { "epoch": 0.1957243548668958, "grad_norm": 0.3721694052219391, "learning_rate": 6.718e-05, "loss": 0.0132, "step": 3360 }, { "epoch": 0.19630686782780918, "grad_norm": 0.5680240392684937, "learning_rate": 6.738e-05, "loss": 0.0197, "step": 3370 }, { "epoch": 0.19688938078872256, "grad_norm": 0.43556028604507446, "learning_rate": 6.758e-05, "loss": 0.0168, "step": 3380 }, { "epoch": 0.19747189374963592, "grad_norm": 0.35641565918922424, "learning_rate": 6.778e-05, "loss": 0.0163, "step": 3390 }, { "epoch": 0.1980544067105493, "grad_norm": 0.5464766025543213, "learning_rate": 6.798e-05, "loss": 0.0169, "step": 3400 }, { "epoch": 0.1986369196714627, "grad_norm": 0.6599177122116089, "learning_rate": 6.818e-05, "loss": 0.0167, "step": 3410 }, { "epoch": 0.19921943263237607, "grad_norm": 0.42683884501457214, "learning_rate": 6.838e-05, "loss": 0.0187, "step": 3420 }, { "epoch": 0.19980194559328945, "grad_norm": 0.389678031206131, "learning_rate": 6.858e-05, "loss": 0.0151, "step": 3430 }, { "epoch": 0.20038445855420284, "grad_norm": 0.7086696624755859, "learning_rate": 6.878e-05, "loss": 0.0169, "step": 3440 }, { "epoch": 0.20096697151511622, "grad_norm": 0.32102447748184204, "learning_rate": 6.898e-05, "loss": 0.0143, "step": 3450 }, { "epoch": 0.20154948447602958, "grad_norm": 0.45341673493385315, "learning_rate": 6.918e-05, "loss": 0.0172, "step": 3460 }, { "epoch": 0.20213199743694296, "grad_norm": 0.22212591767311096, "learning_rate": 6.938e-05, "loss": 0.015, "step": 3470 }, { "epoch": 0.20271451039785635, "grad_norm": 0.4833238422870636, "learning_rate": 6.958e-05, "loss": 0.0153, "step": 3480 }, { "epoch": 0.20329702335876973, "grad_norm": 0.32811468839645386, "learning_rate": 6.978e-05, "loss": 0.016, "step": 3490 }, { "epoch": 0.2038795363196831, "grad_norm": 0.5585900545120239, "learning_rate": 6.998e-05, "loss": 0.0187, "step": 3500 }, { "epoch": 0.2044620492805965, "grad_norm": 0.2120962142944336, "learning_rate": 7.018e-05, "loss": 0.0133, "step": 3510 }, { "epoch": 0.20504456224150988, "grad_norm": 0.3685355484485626, "learning_rate": 7.038e-05, "loss": 0.0129, "step": 3520 }, { "epoch": 0.20562707520242327, "grad_norm": 0.7096900343894958, "learning_rate": 7.058e-05, "loss": 0.014, "step": 3530 }, { "epoch": 0.20620958816333662, "grad_norm": 0.38259226083755493, "learning_rate": 7.078e-05, "loss": 0.0165, "step": 3540 }, { "epoch": 0.20679210112425, "grad_norm": 0.40816497802734375, "learning_rate": 7.098e-05, "loss": 0.0157, "step": 3550 }, { "epoch": 0.2073746140851634, "grad_norm": 0.5711497664451599, "learning_rate": 7.118e-05, "loss": 0.0149, "step": 3560 }, { "epoch": 0.20795712704607677, "grad_norm": 0.4738941788673401, "learning_rate": 7.138e-05, "loss": 0.0182, "step": 3570 }, { "epoch": 0.20853964000699016, "grad_norm": 0.4117421805858612, "learning_rate": 7.158e-05, "loss": 0.0159, "step": 3580 }, { "epoch": 0.20912215296790354, "grad_norm": 0.5389886498451233, "learning_rate": 7.178000000000001e-05, "loss": 0.0138, "step": 3590 }, { "epoch": 0.20970466592881692, "grad_norm": 0.3822018802165985, "learning_rate": 7.198e-05, "loss": 0.016, "step": 3600 }, { "epoch": 0.2102871788897303, "grad_norm": 0.22033627331256866, "learning_rate": 7.218e-05, "loss": 0.0126, "step": 3610 }, { "epoch": 0.21086969185064366, "grad_norm": 0.4978594481945038, "learning_rate": 7.238000000000001e-05, "loss": 0.0154, "step": 3620 }, { "epoch": 0.21145220481155705, "grad_norm": 0.4495975971221924, "learning_rate": 7.258e-05, "loss": 0.0173, "step": 3630 }, { "epoch": 0.21203471777247043, "grad_norm": 0.5416079759597778, "learning_rate": 7.278e-05, "loss": 0.0168, "step": 3640 }, { "epoch": 0.21261723073338382, "grad_norm": 0.3828054964542389, "learning_rate": 7.298000000000001e-05, "loss": 0.018, "step": 3650 }, { "epoch": 0.2131997436942972, "grad_norm": 0.3200025260448456, "learning_rate": 7.318e-05, "loss": 0.0128, "step": 3660 }, { "epoch": 0.21378225665521058, "grad_norm": 0.4282485544681549, "learning_rate": 7.338e-05, "loss": 0.0157, "step": 3670 }, { "epoch": 0.21436476961612397, "grad_norm": 0.29910677671432495, "learning_rate": 7.358000000000001e-05, "loss": 0.0187, "step": 3680 }, { "epoch": 0.21494728257703735, "grad_norm": 0.3366243243217468, "learning_rate": 7.378e-05, "loss": 0.0192, "step": 3690 }, { "epoch": 0.2155297955379507, "grad_norm": 0.38609829545021057, "learning_rate": 7.398e-05, "loss": 0.0155, "step": 3700 }, { "epoch": 0.2161123084988641, "grad_norm": 0.4891170561313629, "learning_rate": 7.418000000000001e-05, "loss": 0.0199, "step": 3710 }, { "epoch": 0.21669482145977748, "grad_norm": 0.3061416745185852, "learning_rate": 7.438e-05, "loss": 0.0149, "step": 3720 }, { "epoch": 0.21727733442069086, "grad_norm": 0.4139918386936188, "learning_rate": 7.458000000000001e-05, "loss": 0.0152, "step": 3730 }, { "epoch": 0.21785984738160424, "grad_norm": 0.4281066954135895, "learning_rate": 7.478e-05, "loss": 0.0182, "step": 3740 }, { "epoch": 0.21844236034251763, "grad_norm": 0.5967773795127869, "learning_rate": 7.498e-05, "loss": 0.0142, "step": 3750 }, { "epoch": 0.219024873303431, "grad_norm": 0.4973334074020386, "learning_rate": 7.518000000000001e-05, "loss": 0.0145, "step": 3760 }, { "epoch": 0.2196073862643444, "grad_norm": 0.5683701634407043, "learning_rate": 7.538e-05, "loss": 0.0202, "step": 3770 }, { "epoch": 0.22018989922525775, "grad_norm": 0.20961564779281616, "learning_rate": 7.558e-05, "loss": 0.0137, "step": 3780 }, { "epoch": 0.22077241218617114, "grad_norm": 0.6690961718559265, "learning_rate": 7.578000000000001e-05, "loss": 0.0138, "step": 3790 }, { "epoch": 0.22135492514708452, "grad_norm": 0.3748764395713806, "learning_rate": 7.598e-05, "loss": 0.0164, "step": 3800 }, { "epoch": 0.2219374381079979, "grad_norm": 0.3678983747959137, "learning_rate": 7.618e-05, "loss": 0.0127, "step": 3810 }, { "epoch": 0.2225199510689113, "grad_norm": 0.5865371823310852, "learning_rate": 7.638000000000001e-05, "loss": 0.0173, "step": 3820 }, { "epoch": 0.22310246402982467, "grad_norm": 0.531247079372406, "learning_rate": 7.658e-05, "loss": 0.0191, "step": 3830 }, { "epoch": 0.22368497699073805, "grad_norm": 0.6862895488739014, "learning_rate": 7.678000000000001e-05, "loss": 0.019, "step": 3840 }, { "epoch": 0.2242674899516514, "grad_norm": 0.5703114867210388, "learning_rate": 7.698000000000001e-05, "loss": 0.0167, "step": 3850 }, { "epoch": 0.2248500029125648, "grad_norm": 0.40875425934791565, "learning_rate": 7.718e-05, "loss": 0.0167, "step": 3860 }, { "epoch": 0.22543251587347818, "grad_norm": 0.47063398361206055, "learning_rate": 7.738000000000001e-05, "loss": 0.0147, "step": 3870 }, { "epoch": 0.22601502883439156, "grad_norm": 0.42150571942329407, "learning_rate": 7.758000000000001e-05, "loss": 0.014, "step": 3880 }, { "epoch": 0.22659754179530495, "grad_norm": 0.3486264646053314, "learning_rate": 7.778e-05, "loss": 0.0134, "step": 3890 }, { "epoch": 0.22718005475621833, "grad_norm": 0.5594764351844788, "learning_rate": 7.798000000000001e-05, "loss": 0.0149, "step": 3900 }, { "epoch": 0.22776256771713171, "grad_norm": 0.29497361183166504, "learning_rate": 7.818000000000001e-05, "loss": 0.0166, "step": 3910 }, { "epoch": 0.2283450806780451, "grad_norm": 0.34858962893486023, "learning_rate": 7.838e-05, "loss": 0.0142, "step": 3920 }, { "epoch": 0.22892759363895845, "grad_norm": 0.3262551724910736, "learning_rate": 7.858000000000001e-05, "loss": 0.0168, "step": 3930 }, { "epoch": 0.22951010659987184, "grad_norm": 0.45243310928344727, "learning_rate": 7.878e-05, "loss": 0.0167, "step": 3940 }, { "epoch": 0.23009261956078522, "grad_norm": 0.38618820905685425, "learning_rate": 7.897999999999999e-05, "loss": 0.0171, "step": 3950 }, { "epoch": 0.2306751325216986, "grad_norm": 0.45082828402519226, "learning_rate": 7.918e-05, "loss": 0.0153, "step": 3960 }, { "epoch": 0.231257645482612, "grad_norm": 0.4792172908782959, "learning_rate": 7.938e-05, "loss": 0.0189, "step": 3970 }, { "epoch": 0.23184015844352537, "grad_norm": 0.4818764328956604, "learning_rate": 7.958e-05, "loss": 0.0153, "step": 3980 }, { "epoch": 0.23242267140443876, "grad_norm": 0.4380292296409607, "learning_rate": 7.978e-05, "loss": 0.0163, "step": 3990 }, { "epoch": 0.23300518436535214, "grad_norm": 0.36028772592544556, "learning_rate": 7.998e-05, "loss": 0.0193, "step": 4000 }, { "epoch": 0.2335876973262655, "grad_norm": 0.5816632509231567, "learning_rate": 8.018e-05, "loss": 0.0186, "step": 4010 }, { "epoch": 0.23417021028717888, "grad_norm": 0.6554579138755798, "learning_rate": 8.038e-05, "loss": 0.0157, "step": 4020 }, { "epoch": 0.23475272324809227, "grad_norm": 0.5256491899490356, "learning_rate": 8.058e-05, "loss": 0.0179, "step": 4030 }, { "epoch": 0.23533523620900565, "grad_norm": 0.20740336179733276, "learning_rate": 8.078e-05, "loss": 0.0155, "step": 4040 }, { "epoch": 0.23591774916991903, "grad_norm": 0.3431355357170105, "learning_rate": 8.098e-05, "loss": 0.0121, "step": 4050 }, { "epoch": 0.23650026213083242, "grad_norm": 0.4470638036727905, "learning_rate": 8.118e-05, "loss": 0.0173, "step": 4060 }, { "epoch": 0.2370827750917458, "grad_norm": 0.23415781557559967, "learning_rate": 8.138e-05, "loss": 0.0135, "step": 4070 }, { "epoch": 0.23766528805265918, "grad_norm": 0.5383740663528442, "learning_rate": 8.158e-05, "loss": 0.014, "step": 4080 }, { "epoch": 0.23824780101357254, "grad_norm": 0.29728975892066956, "learning_rate": 8.178e-05, "loss": 0.0127, "step": 4090 }, { "epoch": 0.23883031397448592, "grad_norm": 0.34492120146751404, "learning_rate": 8.198e-05, "loss": 0.0145, "step": 4100 }, { "epoch": 0.2394128269353993, "grad_norm": 0.36722663044929504, "learning_rate": 8.218e-05, "loss": 0.0131, "step": 4110 }, { "epoch": 0.2399953398963127, "grad_norm": 0.42568346858024597, "learning_rate": 8.238000000000001e-05, "loss": 0.0135, "step": 4120 }, { "epoch": 0.24057785285722608, "grad_norm": 0.3725323975086212, "learning_rate": 8.258e-05, "loss": 0.0155, "step": 4130 }, { "epoch": 0.24116036581813946, "grad_norm": 0.4638804495334625, "learning_rate": 8.278e-05, "loss": 0.0162, "step": 4140 }, { "epoch": 0.24174287877905284, "grad_norm": 0.5920254588127136, "learning_rate": 8.298000000000001e-05, "loss": 0.0148, "step": 4150 }, { "epoch": 0.2423253917399662, "grad_norm": 0.44576090574264526, "learning_rate": 8.318e-05, "loss": 0.0137, "step": 4160 }, { "epoch": 0.24290790470087958, "grad_norm": 0.605644702911377, "learning_rate": 8.338e-05, "loss": 0.0195, "step": 4170 }, { "epoch": 0.24349041766179297, "grad_norm": 0.4544399082660675, "learning_rate": 8.358e-05, "loss": 0.0188, "step": 4180 }, { "epoch": 0.24407293062270635, "grad_norm": 0.3738110363483429, "learning_rate": 8.378e-05, "loss": 0.0174, "step": 4190 }, { "epoch": 0.24465544358361974, "grad_norm": 0.3627634048461914, "learning_rate": 8.398e-05, "loss": 0.0149, "step": 4200 }, { "epoch": 0.24523795654453312, "grad_norm": 0.31631210446357727, "learning_rate": 8.418e-05, "loss": 0.014, "step": 4210 }, { "epoch": 0.2458204695054465, "grad_norm": 0.30765944719314575, "learning_rate": 8.438e-05, "loss": 0.0143, "step": 4220 }, { "epoch": 0.2464029824663599, "grad_norm": 0.20427478849887848, "learning_rate": 8.458e-05, "loss": 0.014, "step": 4230 }, { "epoch": 0.24698549542727324, "grad_norm": 0.45903149247169495, "learning_rate": 8.478e-05, "loss": 0.016, "step": 4240 }, { "epoch": 0.24756800838818663, "grad_norm": 0.48505979776382446, "learning_rate": 8.498e-05, "loss": 0.0176, "step": 4250 }, { "epoch": 0.2481505213491, "grad_norm": 0.3721117675304413, "learning_rate": 8.518000000000001e-05, "loss": 0.0179, "step": 4260 }, { "epoch": 0.2487330343100134, "grad_norm": 0.42781516909599304, "learning_rate": 8.538e-05, "loss": 0.0155, "step": 4270 }, { "epoch": 0.24931554727092678, "grad_norm": 0.3649612367153168, "learning_rate": 8.558e-05, "loss": 0.0142, "step": 4280 }, { "epoch": 0.24989806023184016, "grad_norm": 0.3682873249053955, "learning_rate": 8.578000000000001e-05, "loss": 0.0136, "step": 4290 }, { "epoch": 0.2504805731927535, "grad_norm": 0.4417450726032257, "learning_rate": 8.598e-05, "loss": 0.015, "step": 4300 }, { "epoch": 0.2510630861536669, "grad_norm": 0.49168309569358826, "learning_rate": 8.618e-05, "loss": 0.0119, "step": 4310 }, { "epoch": 0.2516455991145803, "grad_norm": 0.2896113395690918, "learning_rate": 8.638000000000001e-05, "loss": 0.0142, "step": 4320 }, { "epoch": 0.25222811207549367, "grad_norm": 0.3587171733379364, "learning_rate": 8.658e-05, "loss": 0.0121, "step": 4330 }, { "epoch": 0.25281062503640706, "grad_norm": 0.36441630125045776, "learning_rate": 8.678e-05, "loss": 0.0137, "step": 4340 }, { "epoch": 0.25339313799732044, "grad_norm": 0.5269341468811035, "learning_rate": 8.698000000000001e-05, "loss": 0.0132, "step": 4350 }, { "epoch": 0.2539756509582338, "grad_norm": 0.4590550363063812, "learning_rate": 8.718e-05, "loss": 0.0129, "step": 4360 }, { "epoch": 0.2545581639191472, "grad_norm": 0.3238987624645233, "learning_rate": 8.738000000000001e-05, "loss": 0.014, "step": 4370 }, { "epoch": 0.2551406768800606, "grad_norm": 0.33618202805519104, "learning_rate": 8.758000000000001e-05, "loss": 0.0115, "step": 4380 }, { "epoch": 0.255723189840974, "grad_norm": 0.4079549014568329, "learning_rate": 8.778e-05, "loss": 0.0198, "step": 4390 }, { "epoch": 0.25630570280188736, "grad_norm": 0.5520066618919373, "learning_rate": 8.798000000000001e-05, "loss": 0.0224, "step": 4400 }, { "epoch": 0.25688821576280074, "grad_norm": 0.8034793734550476, "learning_rate": 8.818000000000001e-05, "loss": 0.0182, "step": 4410 }, { "epoch": 0.2574707287237141, "grad_norm": 0.5385733246803284, "learning_rate": 8.838e-05, "loss": 0.0185, "step": 4420 }, { "epoch": 0.2580532416846275, "grad_norm": 0.4828646183013916, "learning_rate": 8.858000000000001e-05, "loss": 0.0174, "step": 4430 }, { "epoch": 0.25863575464554084, "grad_norm": 0.45874667167663574, "learning_rate": 8.878000000000001e-05, "loss": 0.023, "step": 4440 }, { "epoch": 0.2592182676064542, "grad_norm": 0.4372740089893341, "learning_rate": 8.898e-05, "loss": 0.0157, "step": 4450 }, { "epoch": 0.2598007805673676, "grad_norm": 0.5255483984947205, "learning_rate": 8.918000000000001e-05, "loss": 0.0168, "step": 4460 }, { "epoch": 0.260383293528281, "grad_norm": 0.3865438401699066, "learning_rate": 8.938e-05, "loss": 0.0167, "step": 4470 }, { "epoch": 0.2609658064891944, "grad_norm": 0.42038583755493164, "learning_rate": 8.958e-05, "loss": 0.0146, "step": 4480 }, { "epoch": 0.26154831945010776, "grad_norm": 0.5365887880325317, "learning_rate": 8.978000000000001e-05, "loss": 0.0185, "step": 4490 }, { "epoch": 0.26213083241102114, "grad_norm": 0.4284862279891968, "learning_rate": 8.998e-05, "loss": 0.0174, "step": 4500 }, { "epoch": 0.2627133453719345, "grad_norm": 0.5291692018508911, "learning_rate": 9.018000000000001e-05, "loss": 0.0237, "step": 4510 }, { "epoch": 0.2632958583328479, "grad_norm": 0.5086433291435242, "learning_rate": 9.038000000000001e-05, "loss": 0.0188, "step": 4520 }, { "epoch": 0.2638783712937613, "grad_norm": 0.27153852581977844, "learning_rate": 9.058e-05, "loss": 0.0176, "step": 4530 }, { "epoch": 0.2644608842546747, "grad_norm": 0.23696446418762207, "learning_rate": 9.078000000000001e-05, "loss": 0.0134, "step": 4540 }, { "epoch": 0.26504339721558806, "grad_norm": 0.3463980257511139, "learning_rate": 9.098000000000001e-05, "loss": 0.0135, "step": 4550 }, { "epoch": 0.26562591017650145, "grad_norm": 0.4919293522834778, "learning_rate": 9.118e-05, "loss": 0.0147, "step": 4560 }, { "epoch": 0.26620842313741483, "grad_norm": 0.4248889088630676, "learning_rate": 9.138e-05, "loss": 0.0136, "step": 4570 }, { "epoch": 0.2667909360983282, "grad_norm": 0.6791334748268127, "learning_rate": 9.158e-05, "loss": 0.0171, "step": 4580 }, { "epoch": 0.26737344905924154, "grad_norm": 0.35522598028182983, "learning_rate": 9.178e-05, "loss": 0.0139, "step": 4590 }, { "epoch": 0.2679559620201549, "grad_norm": 0.41408413648605347, "learning_rate": 9.198e-05, "loss": 0.0149, "step": 4600 }, { "epoch": 0.2685384749810683, "grad_norm": 0.3771199584007263, "learning_rate": 9.218e-05, "loss": 0.0155, "step": 4610 }, { "epoch": 0.2691209879419817, "grad_norm": 0.2775745093822479, "learning_rate": 9.238e-05, "loss": 0.0166, "step": 4620 }, { "epoch": 0.2697035009028951, "grad_norm": 0.4943523705005646, "learning_rate": 9.258e-05, "loss": 0.0167, "step": 4630 }, { "epoch": 0.27028601386380846, "grad_norm": 0.5490389466285706, "learning_rate": 9.278e-05, "loss": 0.0184, "step": 4640 }, { "epoch": 0.27086852682472184, "grad_norm": 0.5853617787361145, "learning_rate": 9.298e-05, "loss": 0.0167, "step": 4650 }, { "epoch": 0.27145103978563523, "grad_norm": 0.3680674433708191, "learning_rate": 9.318e-05, "loss": 0.0213, "step": 4660 }, { "epoch": 0.2720335527465486, "grad_norm": 0.3059559166431427, "learning_rate": 9.338e-05, "loss": 0.0161, "step": 4670 }, { "epoch": 0.272616065707462, "grad_norm": 0.2864623963832855, "learning_rate": 9.358e-05, "loss": 0.0123, "step": 4680 }, { "epoch": 0.2731985786683754, "grad_norm": 0.2930346131324768, "learning_rate": 9.378e-05, "loss": 0.0174, "step": 4690 }, { "epoch": 0.27378109162928876, "grad_norm": 0.46440717577934265, "learning_rate": 9.398e-05, "loss": 0.0191, "step": 4700 }, { "epoch": 0.27436360459020215, "grad_norm": 0.32900869846343994, "learning_rate": 9.418e-05, "loss": 0.0167, "step": 4710 }, { "epoch": 0.27494611755111553, "grad_norm": 0.32534053921699524, "learning_rate": 9.438e-05, "loss": 0.0161, "step": 4720 }, { "epoch": 0.2755286305120289, "grad_norm": 0.41493141651153564, "learning_rate": 9.458e-05, "loss": 0.0159, "step": 4730 }, { "epoch": 0.2761111434729423, "grad_norm": 0.4969571530818939, "learning_rate": 9.478e-05, "loss": 0.0165, "step": 4740 }, { "epoch": 0.27669365643385563, "grad_norm": 0.5208871960639954, "learning_rate": 9.498e-05, "loss": 0.0164, "step": 4750 }, { "epoch": 0.277276169394769, "grad_norm": 0.29879385232925415, "learning_rate": 9.518000000000001e-05, "loss": 0.0178, "step": 4760 }, { "epoch": 0.2778586823556824, "grad_norm": 0.5161657929420471, "learning_rate": 9.538e-05, "loss": 0.0172, "step": 4770 }, { "epoch": 0.2784411953165958, "grad_norm": 0.4936532974243164, "learning_rate": 9.558e-05, "loss": 0.0167, "step": 4780 }, { "epoch": 0.27902370827750916, "grad_norm": 0.405062198638916, "learning_rate": 9.578000000000001e-05, "loss": 0.0157, "step": 4790 }, { "epoch": 0.27960622123842255, "grad_norm": 0.5146819949150085, "learning_rate": 9.598e-05, "loss": 0.0171, "step": 4800 }, { "epoch": 0.28018873419933593, "grad_norm": 0.4021776020526886, "learning_rate": 9.618e-05, "loss": 0.0144, "step": 4810 }, { "epoch": 0.2807712471602493, "grad_norm": 0.4020695090293884, "learning_rate": 9.638000000000001e-05, "loss": 0.0116, "step": 4820 }, { "epoch": 0.2813537601211627, "grad_norm": 0.3685443103313446, "learning_rate": 9.658e-05, "loss": 0.0151, "step": 4830 }, { "epoch": 0.2819362730820761, "grad_norm": 0.33128228783607483, "learning_rate": 9.678e-05, "loss": 0.02, "step": 4840 }, { "epoch": 0.28251878604298947, "grad_norm": 0.3148438334465027, "learning_rate": 9.698000000000001e-05, "loss": 0.0155, "step": 4850 }, { "epoch": 0.28310129900390285, "grad_norm": 0.5226296782493591, "learning_rate": 9.718e-05, "loss": 0.0155, "step": 4860 }, { "epoch": 0.28368381196481623, "grad_norm": 0.46343016624450684, "learning_rate": 9.738e-05, "loss": 0.0148, "step": 4870 }, { "epoch": 0.2842663249257296, "grad_norm": 0.3761071562767029, "learning_rate": 9.758000000000001e-05, "loss": 0.0182, "step": 4880 }, { "epoch": 0.284848837886643, "grad_norm": 0.5723442435264587, "learning_rate": 9.778e-05, "loss": 0.0168, "step": 4890 }, { "epoch": 0.28543135084755633, "grad_norm": 0.4642567038536072, "learning_rate": 9.798000000000001e-05, "loss": 0.0182, "step": 4900 }, { "epoch": 0.2860138638084697, "grad_norm": 0.6905421018600464, "learning_rate": 9.818000000000001e-05, "loss": 0.0172, "step": 4910 }, { "epoch": 0.2865963767693831, "grad_norm": 0.41226300597190857, "learning_rate": 9.838e-05, "loss": 0.0146, "step": 4920 }, { "epoch": 0.2871788897302965, "grad_norm": 0.3736860752105713, "learning_rate": 9.858000000000001e-05, "loss": 0.0175, "step": 4930 }, { "epoch": 0.28776140269120987, "grad_norm": 0.5065261125564575, "learning_rate": 9.878e-05, "loss": 0.0149, "step": 4940 }, { "epoch": 0.28834391565212325, "grad_norm": 0.4767948091030121, "learning_rate": 9.898e-05, "loss": 0.018, "step": 4950 }, { "epoch": 0.28892642861303663, "grad_norm": 0.2821384370326996, "learning_rate": 9.918000000000001e-05, "loss": 0.0136, "step": 4960 }, { "epoch": 0.28950894157395, "grad_norm": 0.5016849637031555, "learning_rate": 9.938e-05, "loss": 0.0206, "step": 4970 }, { "epoch": 0.2900914545348634, "grad_norm": 0.46624720096588135, "learning_rate": 9.958e-05, "loss": 0.019, "step": 4980 }, { "epoch": 0.2906739674957768, "grad_norm": 0.44486159086227417, "learning_rate": 9.978000000000001e-05, "loss": 0.0143, "step": 4990 }, { "epoch": 0.29125648045669017, "grad_norm": 0.3940882980823517, "learning_rate": 9.998e-05, "loss": 0.0156, "step": 5000 }, { "epoch": 0.29183899341760355, "grad_norm": 0.4051775634288788, "learning_rate": 9.999999778549045e-05, "loss": 0.0181, "step": 5010 }, { "epoch": 0.29242150637851694, "grad_norm": 0.3899575173854828, "learning_rate": 9.999999013039593e-05, "loss": 0.0181, "step": 5020 }, { "epoch": 0.2930040193394303, "grad_norm": 0.527410089969635, "learning_rate": 9.999997700737766e-05, "loss": 0.0173, "step": 5030 }, { "epoch": 0.2935865323003437, "grad_norm": 0.2732304632663727, "learning_rate": 9.999995841643709e-05, "loss": 0.0144, "step": 5040 }, { "epoch": 0.2941690452612571, "grad_norm": 0.4507734179496765, "learning_rate": 9.999993435757623e-05, "loss": 0.017, "step": 5050 }, { "epoch": 0.2947515582221704, "grad_norm": 0.43454140424728394, "learning_rate": 9.999990483079773e-05, "loss": 0.0164, "step": 5060 }, { "epoch": 0.2953340711830838, "grad_norm": 0.4725722372531891, "learning_rate": 9.999986983610481e-05, "loss": 0.0134, "step": 5070 }, { "epoch": 0.2959165841439972, "grad_norm": 0.3465936779975891, "learning_rate": 9.99998293735013e-05, "loss": 0.0174, "step": 5080 }, { "epoch": 0.29649909710491057, "grad_norm": 0.25371071696281433, "learning_rate": 9.999978344299161e-05, "loss": 0.017, "step": 5090 }, { "epoch": 0.29708161006582395, "grad_norm": 0.473132461309433, "learning_rate": 9.99997320445808e-05, "loss": 0.0185, "step": 5100 }, { "epoch": 0.29766412302673734, "grad_norm": 0.40821710228919983, "learning_rate": 9.999967517827444e-05, "loss": 0.016, "step": 5110 }, { "epoch": 0.2982466359876507, "grad_norm": 0.3223572373390198, "learning_rate": 9.999961284407879e-05, "loss": 0.0191, "step": 5120 }, { "epoch": 0.2988291489485641, "grad_norm": 0.4222657382488251, "learning_rate": 9.999954504200067e-05, "loss": 0.0124, "step": 5130 }, { "epoch": 0.2994116619094775, "grad_norm": 0.31765875220298767, "learning_rate": 9.999947177204744e-05, "loss": 0.0129, "step": 5140 }, { "epoch": 0.2999941748703909, "grad_norm": 0.2983816862106323, "learning_rate": 9.999939303422718e-05, "loss": 0.0114, "step": 5150 }, { "epoch": 0.30057668783130426, "grad_norm": 0.4218018651008606, "learning_rate": 9.999930882854847e-05, "loss": 0.0157, "step": 5160 }, { "epoch": 0.30115920079221764, "grad_norm": 0.4760737717151642, "learning_rate": 9.999921915502051e-05, "loss": 0.0104, "step": 5170 }, { "epoch": 0.301741713753131, "grad_norm": 0.402182936668396, "learning_rate": 9.99991240136531e-05, "loss": 0.0128, "step": 5180 }, { "epoch": 0.3023242267140444, "grad_norm": 0.35931339859962463, "learning_rate": 9.999902340445668e-05, "loss": 0.0157, "step": 5190 }, { "epoch": 0.3029067396749578, "grad_norm": 0.3377428352832794, "learning_rate": 9.999891732744224e-05, "loss": 0.0189, "step": 5200 }, { "epoch": 0.3034892526358711, "grad_norm": 0.4079914689064026, "learning_rate": 9.999880578262135e-05, "loss": 0.0126, "step": 5210 }, { "epoch": 0.3040717655967845, "grad_norm": 0.33793607354164124, "learning_rate": 9.999868877000624e-05, "loss": 0.016, "step": 5220 }, { "epoch": 0.3046542785576979, "grad_norm": 0.40488117933273315, "learning_rate": 9.99985662896097e-05, "loss": 0.016, "step": 5230 }, { "epoch": 0.3052367915186113, "grad_norm": 0.2692374587059021, "learning_rate": 9.999843834144513e-05, "loss": 0.0136, "step": 5240 }, { "epoch": 0.30581930447952466, "grad_norm": 0.3444440960884094, "learning_rate": 9.99983049255265e-05, "loss": 0.0134, "step": 5250 }, { "epoch": 0.30640181744043804, "grad_norm": 0.34168821573257446, "learning_rate": 9.999816604186843e-05, "loss": 0.0161, "step": 5260 }, { "epoch": 0.3069843304013514, "grad_norm": 0.3528080880641937, "learning_rate": 9.999802169048609e-05, "loss": 0.0163, "step": 5270 }, { "epoch": 0.3075668433622648, "grad_norm": 0.3115560710430145, "learning_rate": 9.999787187139527e-05, "loss": 0.0158, "step": 5280 }, { "epoch": 0.3081493563231782, "grad_norm": 0.39310547709465027, "learning_rate": 9.999771658461234e-05, "loss": 0.0176, "step": 5290 }, { "epoch": 0.3087318692840916, "grad_norm": 0.5506131649017334, "learning_rate": 9.999755583015431e-05, "loss": 0.0175, "step": 5300 }, { "epoch": 0.30931438224500496, "grad_norm": 0.4961826801300049, "learning_rate": 9.999738960803874e-05, "loss": 0.0172, "step": 5310 }, { "epoch": 0.30989689520591834, "grad_norm": 0.46624624729156494, "learning_rate": 9.99972179182838e-05, "loss": 0.0162, "step": 5320 }, { "epoch": 0.3104794081668317, "grad_norm": 0.4351981580257416, "learning_rate": 9.99970407609083e-05, "loss": 0.0145, "step": 5330 }, { "epoch": 0.3110619211277451, "grad_norm": 0.2980080246925354, "learning_rate": 9.999685813593159e-05, "loss": 0.0161, "step": 5340 }, { "epoch": 0.3116444340886585, "grad_norm": 0.35812610387802124, "learning_rate": 9.999667004337362e-05, "loss": 0.0148, "step": 5350 }, { "epoch": 0.3122269470495719, "grad_norm": 0.3485964834690094, "learning_rate": 9.9996476483255e-05, "loss": 0.0134, "step": 5360 }, { "epoch": 0.3128094600104852, "grad_norm": 0.46924644708633423, "learning_rate": 9.999627745559688e-05, "loss": 0.0166, "step": 5370 }, { "epoch": 0.3133919729713986, "grad_norm": 0.38728177547454834, "learning_rate": 9.999607296042101e-05, "loss": 0.0149, "step": 5380 }, { "epoch": 0.313974485932312, "grad_norm": 0.35936930775642395, "learning_rate": 9.99958629977498e-05, "loss": 0.0146, "step": 5390 }, { "epoch": 0.31455699889322536, "grad_norm": 0.3250254988670349, "learning_rate": 9.999564756760615e-05, "loss": 0.0138, "step": 5400 }, { "epoch": 0.31513951185413874, "grad_norm": 0.206546351313591, "learning_rate": 9.999542667001366e-05, "loss": 0.0149, "step": 5410 }, { "epoch": 0.3157220248150521, "grad_norm": 0.49581554532051086, "learning_rate": 9.999520030499647e-05, "loss": 0.0145, "step": 5420 }, { "epoch": 0.3163045377759655, "grad_norm": 0.36206045746803284, "learning_rate": 9.999496847257936e-05, "loss": 0.017, "step": 5430 }, { "epoch": 0.3168870507368789, "grad_norm": 0.36043521761894226, "learning_rate": 9.999473117278764e-05, "loss": 0.0152, "step": 5440 }, { "epoch": 0.3174695636977923, "grad_norm": 0.5072677731513977, "learning_rate": 9.999448840564731e-05, "loss": 0.0173, "step": 5450 }, { "epoch": 0.31805207665870566, "grad_norm": 0.27587565779685974, "learning_rate": 9.999424017118488e-05, "loss": 0.0131, "step": 5460 }, { "epoch": 0.31863458961961905, "grad_norm": 0.35345056653022766, "learning_rate": 9.999398646942751e-05, "loss": 0.0134, "step": 5470 }, { "epoch": 0.31921710258053243, "grad_norm": 0.36257249116897583, "learning_rate": 9.999372730040296e-05, "loss": 0.0142, "step": 5480 }, { "epoch": 0.3197996155414458, "grad_norm": 0.2715804874897003, "learning_rate": 9.999346266413953e-05, "loss": 0.0127, "step": 5490 }, { "epoch": 0.3203821285023592, "grad_norm": 0.3549053966999054, "learning_rate": 9.99931925606662e-05, "loss": 0.0132, "step": 5500 }, { "epoch": 0.3209646414632726, "grad_norm": 0.33724838495254517, "learning_rate": 9.99929169900125e-05, "loss": 0.0132, "step": 5510 }, { "epoch": 0.32154715442418597, "grad_norm": 0.2593380808830261, "learning_rate": 9.999263595220855e-05, "loss": 0.0108, "step": 5520 }, { "epoch": 0.3221296673850993, "grad_norm": 0.33689215779304504, "learning_rate": 9.99923494472851e-05, "loss": 0.0204, "step": 5530 }, { "epoch": 0.3227121803460127, "grad_norm": 0.2451244294643402, "learning_rate": 9.999205747527348e-05, "loss": 0.013, "step": 5540 }, { "epoch": 0.32329469330692606, "grad_norm": 0.4160633087158203, "learning_rate": 9.999176003620561e-05, "loss": 0.0108, "step": 5550 }, { "epoch": 0.32387720626783945, "grad_norm": 0.4629513621330261, "learning_rate": 9.999145713011405e-05, "loss": 0.0145, "step": 5560 }, { "epoch": 0.32445971922875283, "grad_norm": 0.27118727564811707, "learning_rate": 9.999114875703186e-05, "loss": 0.0136, "step": 5570 }, { "epoch": 0.3250422321896662, "grad_norm": 0.33480867743492126, "learning_rate": 9.999083491699281e-05, "loss": 0.0147, "step": 5580 }, { "epoch": 0.3256247451505796, "grad_norm": 0.4176709055900574, "learning_rate": 9.999051561003123e-05, "loss": 0.015, "step": 5590 }, { "epoch": 0.326207258111493, "grad_norm": 0.3892660439014435, "learning_rate": 9.999019083618202e-05, "loss": 0.0159, "step": 5600 }, { "epoch": 0.32678977107240637, "grad_norm": 0.44021210074424744, "learning_rate": 9.99898605954807e-05, "loss": 0.0174, "step": 5610 }, { "epoch": 0.32737228403331975, "grad_norm": 0.5735315084457397, "learning_rate": 9.998952488796338e-05, "loss": 0.0148, "step": 5620 }, { "epoch": 0.32795479699423313, "grad_norm": 0.4243243932723999, "learning_rate": 9.998918371366676e-05, "loss": 0.0145, "step": 5630 }, { "epoch": 0.3285373099551465, "grad_norm": 0.35645046830177307, "learning_rate": 9.99888370726282e-05, "loss": 0.0185, "step": 5640 }, { "epoch": 0.3291198229160599, "grad_norm": 0.32461854815483093, "learning_rate": 9.998848496488556e-05, "loss": 0.0129, "step": 5650 }, { "epoch": 0.3297023358769733, "grad_norm": 0.24589473009109497, "learning_rate": 9.998812739047736e-05, "loss": 0.0129, "step": 5660 }, { "epoch": 0.33028484883788667, "grad_norm": 0.3177158534526825, "learning_rate": 9.99877643494427e-05, "loss": 0.0137, "step": 5670 }, { "epoch": 0.3308673617988, "grad_norm": 0.2808033525943756, "learning_rate": 9.998739584182128e-05, "loss": 0.0139, "step": 5680 }, { "epoch": 0.3314498747597134, "grad_norm": 0.31433242559432983, "learning_rate": 9.998702186765342e-05, "loss": 0.0169, "step": 5690 }, { "epoch": 0.33203238772062676, "grad_norm": 0.3387039303779602, "learning_rate": 9.998664242698e-05, "loss": 0.0121, "step": 5700 }, { "epoch": 0.33261490068154015, "grad_norm": 0.4055728316307068, "learning_rate": 9.998625751984251e-05, "loss": 0.0187, "step": 5710 }, { "epoch": 0.33319741364245353, "grad_norm": 0.3665350675582886, "learning_rate": 9.998586714628307e-05, "loss": 0.0189, "step": 5720 }, { "epoch": 0.3337799266033669, "grad_norm": 0.2784072458744049, "learning_rate": 9.998547130634432e-05, "loss": 0.0195, "step": 5730 }, { "epoch": 0.3343624395642803, "grad_norm": 0.4538620412349701, "learning_rate": 9.99850700000696e-05, "loss": 0.0139, "step": 5740 }, { "epoch": 0.3349449525251937, "grad_norm": 0.4001680314540863, "learning_rate": 9.998466322750278e-05, "loss": 0.013, "step": 5750 }, { "epoch": 0.33552746548610707, "grad_norm": 0.480720192193985, "learning_rate": 9.998425098868834e-05, "loss": 0.0147, "step": 5760 }, { "epoch": 0.33610997844702045, "grad_norm": 0.49253153800964355, "learning_rate": 9.998383328367136e-05, "loss": 0.0153, "step": 5770 }, { "epoch": 0.33669249140793384, "grad_norm": 0.6121100187301636, "learning_rate": 9.99834101124975e-05, "loss": 0.016, "step": 5780 }, { "epoch": 0.3372750043688472, "grad_norm": 0.2701857089996338, "learning_rate": 9.998298147521309e-05, "loss": 0.0173, "step": 5790 }, { "epoch": 0.3378575173297606, "grad_norm": 0.3838884234428406, "learning_rate": 9.998254737186496e-05, "loss": 0.0153, "step": 5800 }, { "epoch": 0.338440030290674, "grad_norm": 0.2599169611930847, "learning_rate": 9.99821078025006e-05, "loss": 0.0106, "step": 5810 }, { "epoch": 0.33902254325158737, "grad_norm": 0.5707082152366638, "learning_rate": 9.998166276716807e-05, "loss": 0.0118, "step": 5820 }, { "epoch": 0.33960505621250076, "grad_norm": 0.4981538653373718, "learning_rate": 9.998121226591606e-05, "loss": 0.0177, "step": 5830 }, { "epoch": 0.3401875691734141, "grad_norm": 0.48711279034614563, "learning_rate": 9.998075629879382e-05, "loss": 0.0166, "step": 5840 }, { "epoch": 0.34077008213432747, "grad_norm": 0.4262441396713257, "learning_rate": 9.99802948658512e-05, "loss": 0.0197, "step": 5850 }, { "epoch": 0.34135259509524085, "grad_norm": 0.28903353214263916, "learning_rate": 9.99798279671387e-05, "loss": 0.0147, "step": 5860 }, { "epoch": 0.34193510805615424, "grad_norm": 0.3519308269023895, "learning_rate": 9.997935560270734e-05, "loss": 0.0135, "step": 5870 }, { "epoch": 0.3425176210170676, "grad_norm": 0.46700215339660645, "learning_rate": 9.997887777260879e-05, "loss": 0.02, "step": 5880 }, { "epoch": 0.343100133977981, "grad_norm": 0.35438182950019836, "learning_rate": 9.997839447689532e-05, "loss": 0.0136, "step": 5890 }, { "epoch": 0.3436826469388944, "grad_norm": 0.3668670654296875, "learning_rate": 9.997790571561978e-05, "loss": 0.0167, "step": 5900 }, { "epoch": 0.34426515989980777, "grad_norm": 0.30584514141082764, "learning_rate": 9.99774114888356e-05, "loss": 0.0148, "step": 5910 }, { "epoch": 0.34484767286072115, "grad_norm": 0.3014158606529236, "learning_rate": 9.997691179659684e-05, "loss": 0.0173, "step": 5920 }, { "epoch": 0.34543018582163454, "grad_norm": 0.32131320238113403, "learning_rate": 9.997640663895815e-05, "loss": 0.0138, "step": 5930 }, { "epoch": 0.3460126987825479, "grad_norm": 0.2618979811668396, "learning_rate": 9.997589601597477e-05, "loss": 0.0132, "step": 5940 }, { "epoch": 0.3465952117434613, "grad_norm": 0.376512736082077, "learning_rate": 9.997537992770252e-05, "loss": 0.0146, "step": 5950 }, { "epoch": 0.3471777247043747, "grad_norm": 0.35291817784309387, "learning_rate": 9.997485837419788e-05, "loss": 0.0136, "step": 5960 }, { "epoch": 0.3477602376652881, "grad_norm": 0.28590676188468933, "learning_rate": 9.997433135551786e-05, "loss": 0.012, "step": 5970 }, { "epoch": 0.34834275062620146, "grad_norm": 0.426969051361084, "learning_rate": 9.997379887172009e-05, "loss": 0.0147, "step": 5980 }, { "epoch": 0.3489252635871148, "grad_norm": 0.4630492031574249, "learning_rate": 9.997326092286281e-05, "loss": 0.0125, "step": 5990 }, { "epoch": 0.34950777654802817, "grad_norm": 0.5408620238304138, "learning_rate": 9.997271750900486e-05, "loss": 0.0145, "step": 6000 }, { "epoch": 0.35009028950894155, "grad_norm": 0.21542291343212128, "learning_rate": 9.997216863020565e-05, "loss": 0.0119, "step": 6010 }, { "epoch": 0.35067280246985494, "grad_norm": 0.28633853793144226, "learning_rate": 9.99716142865252e-05, "loss": 0.0132, "step": 6020 }, { "epoch": 0.3512553154307683, "grad_norm": 0.44069865345954895, "learning_rate": 9.997105447802415e-05, "loss": 0.0116, "step": 6030 }, { "epoch": 0.3518378283916817, "grad_norm": 0.36303916573524475, "learning_rate": 9.997048920476373e-05, "loss": 0.0167, "step": 6040 }, { "epoch": 0.3524203413525951, "grad_norm": 0.3057125210762024, "learning_rate": 9.996991846680572e-05, "loss": 0.0133, "step": 6050 }, { "epoch": 0.3530028543135085, "grad_norm": 0.2529241740703583, "learning_rate": 9.996934226421257e-05, "loss": 0.0145, "step": 6060 }, { "epoch": 0.35358536727442186, "grad_norm": 0.25341537594795227, "learning_rate": 9.996876059704726e-05, "loss": 0.0121, "step": 6070 }, { "epoch": 0.35416788023533524, "grad_norm": 0.1897246390581131, "learning_rate": 9.996817346537343e-05, "loss": 0.0171, "step": 6080 }, { "epoch": 0.3547503931962486, "grad_norm": 0.1854855716228485, "learning_rate": 9.996758086925526e-05, "loss": 0.0145, "step": 6090 }, { "epoch": 0.355332906157162, "grad_norm": 0.3148235082626343, "learning_rate": 9.996698280875759e-05, "loss": 0.0112, "step": 6100 }, { "epoch": 0.3559154191180754, "grad_norm": 0.2586146891117096, "learning_rate": 9.99663792839458e-05, "loss": 0.0119, "step": 6110 }, { "epoch": 0.3564979320789888, "grad_norm": 0.40150609612464905, "learning_rate": 9.99657702948859e-05, "loss": 0.0169, "step": 6120 }, { "epoch": 0.35708044503990216, "grad_norm": 0.3099178075790405, "learning_rate": 9.996515584164448e-05, "loss": 0.0161, "step": 6130 }, { "epoch": 0.35766295800081555, "grad_norm": 0.3265916407108307, "learning_rate": 9.996453592428873e-05, "loss": 0.0118, "step": 6140 }, { "epoch": 0.3582454709617289, "grad_norm": 0.35054072737693787, "learning_rate": 9.996391054288646e-05, "loss": 0.0124, "step": 6150 }, { "epoch": 0.35882798392264226, "grad_norm": 0.3347821831703186, "learning_rate": 9.996327969750605e-05, "loss": 0.0155, "step": 6160 }, { "epoch": 0.35941049688355564, "grad_norm": 0.2547692358493805, "learning_rate": 9.996264338821649e-05, "loss": 0.0148, "step": 6170 }, { "epoch": 0.359993009844469, "grad_norm": 0.3955100178718567, "learning_rate": 9.996200161508735e-05, "loss": 0.0161, "step": 6180 }, { "epoch": 0.3605755228053824, "grad_norm": 0.5246358513832092, "learning_rate": 9.996135437818885e-05, "loss": 0.0162, "step": 6190 }, { "epoch": 0.3611580357662958, "grad_norm": 0.3335292935371399, "learning_rate": 9.996070167759175e-05, "loss": 0.0146, "step": 6200 }, { "epoch": 0.3617405487272092, "grad_norm": 0.3819826543331146, "learning_rate": 9.996004351336743e-05, "loss": 0.0131, "step": 6210 }, { "epoch": 0.36232306168812256, "grad_norm": 0.30899718403816223, "learning_rate": 9.995937988558785e-05, "loss": 0.0128, "step": 6220 }, { "epoch": 0.36290557464903594, "grad_norm": 0.2213035523891449, "learning_rate": 9.995871079432561e-05, "loss": 0.0143, "step": 6230 }, { "epoch": 0.36348808760994933, "grad_norm": 0.3285326361656189, "learning_rate": 9.995803623965389e-05, "loss": 0.0136, "step": 6240 }, { "epoch": 0.3640706005708627, "grad_norm": 0.2862640619277954, "learning_rate": 9.995735622164641e-05, "loss": 0.0135, "step": 6250 }, { "epoch": 0.3646531135317761, "grad_norm": 0.4550643563270569, "learning_rate": 9.995667074037758e-05, "loss": 0.015, "step": 6260 }, { "epoch": 0.3652356264926895, "grad_norm": 0.4130365550518036, "learning_rate": 9.995597979592232e-05, "loss": 0.015, "step": 6270 }, { "epoch": 0.36581813945360286, "grad_norm": 0.2879713475704193, "learning_rate": 9.995528338835625e-05, "loss": 0.0129, "step": 6280 }, { "epoch": 0.36640065241451625, "grad_norm": 0.25882580876350403, "learning_rate": 9.995458151775547e-05, "loss": 0.0154, "step": 6290 }, { "epoch": 0.3669831653754296, "grad_norm": 0.3760060667991638, "learning_rate": 9.995387418419677e-05, "loss": 0.0125, "step": 6300 }, { "epoch": 0.36756567833634296, "grad_norm": 0.26097920536994934, "learning_rate": 9.99531613877575e-05, "loss": 0.0155, "step": 6310 }, { "epoch": 0.36814819129725634, "grad_norm": 0.24474230408668518, "learning_rate": 9.995244312851559e-05, "loss": 0.012, "step": 6320 }, { "epoch": 0.36873070425816973, "grad_norm": 0.24583221971988678, "learning_rate": 9.995171940654961e-05, "loss": 0.0111, "step": 6330 }, { "epoch": 0.3693132172190831, "grad_norm": 0.4878476560115814, "learning_rate": 9.995099022193871e-05, "loss": 0.0131, "step": 6340 }, { "epoch": 0.3698957301799965, "grad_norm": 0.4050667881965637, "learning_rate": 9.995025557476261e-05, "loss": 0.0121, "step": 6350 }, { "epoch": 0.3704782431409099, "grad_norm": 0.3975124955177307, "learning_rate": 9.994951546510165e-05, "loss": 0.0148, "step": 6360 }, { "epoch": 0.37106075610182326, "grad_norm": 0.37463217973709106, "learning_rate": 9.994876989303679e-05, "loss": 0.0162, "step": 6370 }, { "epoch": 0.37164326906273665, "grad_norm": 0.3865721523761749, "learning_rate": 9.994801885864955e-05, "loss": 0.0143, "step": 6380 }, { "epoch": 0.37222578202365003, "grad_norm": 0.25392428040504456, "learning_rate": 9.994726236202205e-05, "loss": 0.0126, "step": 6390 }, { "epoch": 0.3728082949845634, "grad_norm": 0.2158985435962677, "learning_rate": 9.994650040323704e-05, "loss": 0.0121, "step": 6400 }, { "epoch": 0.3733908079454768, "grad_norm": 0.34085288643836975, "learning_rate": 9.994573298237784e-05, "loss": 0.0121, "step": 6410 }, { "epoch": 0.3739733209063902, "grad_norm": 0.27764540910720825, "learning_rate": 9.994496009952837e-05, "loss": 0.0146, "step": 6420 }, { "epoch": 0.37455583386730357, "grad_norm": 0.3376549184322357, "learning_rate": 9.994418175477316e-05, "loss": 0.0102, "step": 6430 }, { "epoch": 0.37513834682821695, "grad_norm": 0.20642155408859253, "learning_rate": 9.994339794819733e-05, "loss": 0.0124, "step": 6440 }, { "epoch": 0.37572085978913033, "grad_norm": 0.3245970606803894, "learning_rate": 9.994260867988658e-05, "loss": 0.0099, "step": 6450 }, { "epoch": 0.37630337275004366, "grad_norm": 0.16644133627414703, "learning_rate": 9.994181394992723e-05, "loss": 0.0102, "step": 6460 }, { "epoch": 0.37688588571095705, "grad_norm": 0.35917696356773376, "learning_rate": 9.994101375840618e-05, "loss": 0.0137, "step": 6470 }, { "epoch": 0.37746839867187043, "grad_norm": 0.42482873797416687, "learning_rate": 9.994020810541098e-05, "loss": 0.0117, "step": 6480 }, { "epoch": 0.3780509116327838, "grad_norm": 0.4261583089828491, "learning_rate": 9.99393969910297e-05, "loss": 0.0128, "step": 6490 }, { "epoch": 0.3786334245936972, "grad_norm": 0.30282726883888245, "learning_rate": 9.993858041535104e-05, "loss": 0.0123, "step": 6500 }, { "epoch": 0.3792159375546106, "grad_norm": 0.5752677917480469, "learning_rate": 9.99377583784643e-05, "loss": 0.0116, "step": 6510 }, { "epoch": 0.37979845051552397, "grad_norm": 0.3902672231197357, "learning_rate": 9.993693088045939e-05, "loss": 0.0149, "step": 6520 }, { "epoch": 0.38038096347643735, "grad_norm": 0.45048484206199646, "learning_rate": 9.99360979214268e-05, "loss": 0.0156, "step": 6530 }, { "epoch": 0.38096347643735073, "grad_norm": 0.3503977358341217, "learning_rate": 9.99352595014576e-05, "loss": 0.0142, "step": 6540 }, { "epoch": 0.3815459893982641, "grad_norm": 0.45726278424263, "learning_rate": 9.993441562064354e-05, "loss": 0.0142, "step": 6550 }, { "epoch": 0.3821285023591775, "grad_norm": 0.25229156017303467, "learning_rate": 9.993356627907685e-05, "loss": 0.0152, "step": 6560 }, { "epoch": 0.3827110153200909, "grad_norm": 0.39476731419563293, "learning_rate": 9.99327114768504e-05, "loss": 0.0163, "step": 6570 }, { "epoch": 0.38329352828100427, "grad_norm": 0.45719292759895325, "learning_rate": 9.99318512140577e-05, "loss": 0.0201, "step": 6580 }, { "epoch": 0.38387604124191765, "grad_norm": 0.46279555559158325, "learning_rate": 9.993098549079284e-05, "loss": 0.0136, "step": 6590 }, { "epoch": 0.38445855420283104, "grad_norm": 1.7729079723358154, "learning_rate": 9.993011430715047e-05, "loss": 0.0163, "step": 6600 }, { "epoch": 0.38504106716374437, "grad_norm": 0.5913901925086975, "learning_rate": 9.992923766322586e-05, "loss": 0.0156, "step": 6610 }, { "epoch": 0.38562358012465775, "grad_norm": 0.34780800342559814, "learning_rate": 9.99283555591149e-05, "loss": 0.0136, "step": 6620 }, { "epoch": 0.38620609308557113, "grad_norm": 0.2497500628232956, "learning_rate": 9.992746799491404e-05, "loss": 0.0176, "step": 6630 }, { "epoch": 0.3867886060464845, "grad_norm": 0.30821409821510315, "learning_rate": 9.992657497072033e-05, "loss": 0.0136, "step": 6640 }, { "epoch": 0.3873711190073979, "grad_norm": 0.2045343667268753, "learning_rate": 9.992567648663147e-05, "loss": 0.012, "step": 6650 }, { "epoch": 0.3879536319683113, "grad_norm": 0.3072163760662079, "learning_rate": 9.992477254274568e-05, "loss": 0.0121, "step": 6660 }, { "epoch": 0.38853614492922467, "grad_norm": 0.2853500545024872, "learning_rate": 9.992386313916183e-05, "loss": 0.0125, "step": 6670 }, { "epoch": 0.38911865789013805, "grad_norm": 0.2193383127450943, "learning_rate": 9.992294827597934e-05, "loss": 0.0139, "step": 6680 }, { "epoch": 0.38970117085105144, "grad_norm": 0.26137661933898926, "learning_rate": 9.992202795329831e-05, "loss": 0.0124, "step": 6690 }, { "epoch": 0.3902836838119648, "grad_norm": 0.35796210169792175, "learning_rate": 9.992110217121936e-05, "loss": 0.0131, "step": 6700 }, { "epoch": 0.3908661967728782, "grad_norm": 0.4279096722602844, "learning_rate": 9.992017092984372e-05, "loss": 0.0134, "step": 6710 }, { "epoch": 0.3914487097337916, "grad_norm": 0.3606835603713989, "learning_rate": 9.991923422927326e-05, "loss": 0.0135, "step": 6720 }, { "epoch": 0.392031222694705, "grad_norm": 0.39950212836265564, "learning_rate": 9.991829206961037e-05, "loss": 0.0145, "step": 6730 }, { "epoch": 0.39261373565561836, "grad_norm": 0.33112040162086487, "learning_rate": 9.991734445095813e-05, "loss": 0.0172, "step": 6740 }, { "epoch": 0.39319624861653174, "grad_norm": 0.3431642949581146, "learning_rate": 9.991639137342015e-05, "loss": 0.013, "step": 6750 }, { "epoch": 0.3937787615774451, "grad_norm": 0.3027171492576599, "learning_rate": 9.991543283710064e-05, "loss": 0.0127, "step": 6760 }, { "epoch": 0.39436127453835845, "grad_norm": 0.37543848156929016, "learning_rate": 9.991446884210445e-05, "loss": 0.0112, "step": 6770 }, { "epoch": 0.39494378749927184, "grad_norm": 0.4543565809726715, "learning_rate": 9.9913499388537e-05, "loss": 0.0125, "step": 6780 }, { "epoch": 0.3955263004601852, "grad_norm": 0.3261888921260834, "learning_rate": 9.99125244765043e-05, "loss": 0.0117, "step": 6790 }, { "epoch": 0.3961088134210986, "grad_norm": 0.37533843517303467, "learning_rate": 9.991154410611296e-05, "loss": 0.0159, "step": 6800 }, { "epoch": 0.396691326382012, "grad_norm": 0.35702934861183167, "learning_rate": 9.99105582774702e-05, "loss": 0.0095, "step": 6810 }, { "epoch": 0.3972738393429254, "grad_norm": 0.31491178274154663, "learning_rate": 9.990956699068384e-05, "loss": 0.0128, "step": 6820 }, { "epoch": 0.39785635230383876, "grad_norm": 0.28954946994781494, "learning_rate": 9.990857024586224e-05, "loss": 0.0112, "step": 6830 }, { "epoch": 0.39843886526475214, "grad_norm": 0.3444023132324219, "learning_rate": 9.990756804311446e-05, "loss": 0.0122, "step": 6840 }, { "epoch": 0.3990213782256655, "grad_norm": 0.31443479657173157, "learning_rate": 9.990656038255006e-05, "loss": 0.0096, "step": 6850 }, { "epoch": 0.3996038911865789, "grad_norm": 0.5514444708824158, "learning_rate": 9.990554726427926e-05, "loss": 0.0161, "step": 6860 }, { "epoch": 0.4001864041474923, "grad_norm": 0.4080047905445099, "learning_rate": 9.990452868841284e-05, "loss": 0.0158, "step": 6870 }, { "epoch": 0.4007689171084057, "grad_norm": 0.38745537400245667, "learning_rate": 9.99035046550622e-05, "loss": 0.0198, "step": 6880 }, { "epoch": 0.40135143006931906, "grad_norm": 0.28975197672843933, "learning_rate": 9.99024751643393e-05, "loss": 0.0144, "step": 6890 }, { "epoch": 0.40193394303023244, "grad_norm": 0.3628930151462555, "learning_rate": 9.990144021635677e-05, "loss": 0.0123, "step": 6900 }, { "epoch": 0.4025164559911458, "grad_norm": 0.5878703594207764, "learning_rate": 9.990039981122775e-05, "loss": 0.0149, "step": 6910 }, { "epoch": 0.40309896895205916, "grad_norm": 0.43749356269836426, "learning_rate": 9.989935394906602e-05, "loss": 0.0181, "step": 6920 }, { "epoch": 0.40368148191297254, "grad_norm": 0.3514690697193146, "learning_rate": 9.989830262998598e-05, "loss": 0.0112, "step": 6930 }, { "epoch": 0.4042639948738859, "grad_norm": 0.26862195134162903, "learning_rate": 9.989724585410259e-05, "loss": 0.0127, "step": 6940 }, { "epoch": 0.4048465078347993, "grad_norm": 0.2774137556552887, "learning_rate": 9.989618362153139e-05, "loss": 0.0176, "step": 6950 }, { "epoch": 0.4054290207957127, "grad_norm": 0.2702299654483795, "learning_rate": 9.989511593238859e-05, "loss": 0.0143, "step": 6960 }, { "epoch": 0.4060115337566261, "grad_norm": 0.39815017580986023, "learning_rate": 9.98940427867909e-05, "loss": 0.0157, "step": 6970 }, { "epoch": 0.40659404671753946, "grad_norm": 0.39254748821258545, "learning_rate": 9.989296418485573e-05, "loss": 0.0122, "step": 6980 }, { "epoch": 0.40717655967845284, "grad_norm": 0.3899990916252136, "learning_rate": 9.989188012670101e-05, "loss": 0.0159, "step": 6990 }, { "epoch": 0.4077590726393662, "grad_norm": 0.8622779250144958, "learning_rate": 9.989079061244528e-05, "loss": 0.0135, "step": 7000 }, { "epoch": 0.4083415856002796, "grad_norm": 0.34025272727012634, "learning_rate": 9.988969564220769e-05, "loss": 0.0141, "step": 7010 }, { "epoch": 0.408924098561193, "grad_norm": 0.27917760610580444, "learning_rate": 9.988859521610801e-05, "loss": 0.0129, "step": 7020 }, { "epoch": 0.4095066115221064, "grad_norm": 0.3322330415248871, "learning_rate": 9.988748933426656e-05, "loss": 0.0143, "step": 7030 }, { "epoch": 0.41008912448301976, "grad_norm": 0.1964351385831833, "learning_rate": 9.988637799680428e-05, "loss": 0.0157, "step": 7040 }, { "epoch": 0.41067163744393315, "grad_norm": 0.4311278164386749, "learning_rate": 9.98852612038427e-05, "loss": 0.0179, "step": 7050 }, { "epoch": 0.41125415040484653, "grad_norm": 0.3004930317401886, "learning_rate": 9.988413895550397e-05, "loss": 0.014, "step": 7060 }, { "epoch": 0.4118366633657599, "grad_norm": 0.4133649468421936, "learning_rate": 9.98830112519108e-05, "loss": 0.0138, "step": 7070 }, { "epoch": 0.41241917632667324, "grad_norm": 0.25272491574287415, "learning_rate": 9.98818780931865e-05, "loss": 0.0109, "step": 7080 }, { "epoch": 0.4130016892875866, "grad_norm": 0.23665858805179596, "learning_rate": 9.988073947945502e-05, "loss": 0.0125, "step": 7090 }, { "epoch": 0.4135842022485, "grad_norm": 0.33765506744384766, "learning_rate": 9.987959541084087e-05, "loss": 0.0129, "step": 7100 }, { "epoch": 0.4141667152094134, "grad_norm": 0.20850346982479095, "learning_rate": 9.987844588746915e-05, "loss": 0.0122, "step": 7110 }, { "epoch": 0.4147492281703268, "grad_norm": 0.41034919023513794, "learning_rate": 9.987729090946558e-05, "loss": 0.0146, "step": 7120 }, { "epoch": 0.41533174113124016, "grad_norm": 0.5243836045265198, "learning_rate": 9.987613047695647e-05, "loss": 0.0139, "step": 7130 }, { "epoch": 0.41591425409215355, "grad_norm": 0.5095720291137695, "learning_rate": 9.987496459006871e-05, "loss": 0.0175, "step": 7140 }, { "epoch": 0.41649676705306693, "grad_norm": 0.26434779167175293, "learning_rate": 9.987379324892982e-05, "loss": 0.0144, "step": 7150 }, { "epoch": 0.4170792800139803, "grad_norm": 0.3250783681869507, "learning_rate": 9.987261645366788e-05, "loss": 0.0154, "step": 7160 }, { "epoch": 0.4176617929748937, "grad_norm": 0.3645446002483368, "learning_rate": 9.987143420441158e-05, "loss": 0.0128, "step": 7170 }, { "epoch": 0.4182443059358071, "grad_norm": 0.2936994433403015, "learning_rate": 9.987024650129022e-05, "loss": 0.0137, "step": 7180 }, { "epoch": 0.41882681889672047, "grad_norm": 0.2474609911441803, "learning_rate": 9.986905334443368e-05, "loss": 0.0158, "step": 7190 }, { "epoch": 0.41940933185763385, "grad_norm": 0.3147972822189331, "learning_rate": 9.986785473397245e-05, "loss": 0.0144, "step": 7200 }, { "epoch": 0.41999184481854723, "grad_norm": 0.1938021332025528, "learning_rate": 9.98666506700376e-05, "loss": 0.0099, "step": 7210 }, { "epoch": 0.4205743577794606, "grad_norm": 0.41998913884162903, "learning_rate": 9.986544115276081e-05, "loss": 0.0162, "step": 7220 }, { "epoch": 0.421156870740374, "grad_norm": 0.4478550851345062, "learning_rate": 9.986422618227433e-05, "loss": 0.0128, "step": 7230 }, { "epoch": 0.42173938370128733, "grad_norm": 0.47049063444137573, "learning_rate": 9.986300575871106e-05, "loss": 0.0137, "step": 7240 }, { "epoch": 0.4223218966622007, "grad_norm": 0.7181037068367004, "learning_rate": 9.986177988220444e-05, "loss": 0.0139, "step": 7250 }, { "epoch": 0.4229044096231141, "grad_norm": 0.27280113101005554, "learning_rate": 9.986054855288856e-05, "loss": 0.0146, "step": 7260 }, { "epoch": 0.4234869225840275, "grad_norm": 0.29199454188346863, "learning_rate": 9.985931177089802e-05, "loss": 0.0133, "step": 7270 }, { "epoch": 0.42406943554494086, "grad_norm": 0.28355783224105835, "learning_rate": 9.985806953636814e-05, "loss": 0.0116, "step": 7280 }, { "epoch": 0.42465194850585425, "grad_norm": 0.4096960127353668, "learning_rate": 9.985682184943471e-05, "loss": 0.0124, "step": 7290 }, { "epoch": 0.42523446146676763, "grad_norm": 0.2560587525367737, "learning_rate": 9.98555687102342e-05, "loss": 0.0127, "step": 7300 }, { "epoch": 0.425816974427681, "grad_norm": 0.2811196744441986, "learning_rate": 9.985431011890367e-05, "loss": 0.0151, "step": 7310 }, { "epoch": 0.4263994873885944, "grad_norm": 0.21689921617507935, "learning_rate": 9.985304607558075e-05, "loss": 0.0109, "step": 7320 }, { "epoch": 0.4269820003495078, "grad_norm": 0.26157745718955994, "learning_rate": 9.985177658040364e-05, "loss": 0.0138, "step": 7330 }, { "epoch": 0.42756451331042117, "grad_norm": 0.36552417278289795, "learning_rate": 9.985050163351119e-05, "loss": 0.0117, "step": 7340 }, { "epoch": 0.42814702627133455, "grad_norm": 0.23382605612277985, "learning_rate": 9.984922123504286e-05, "loss": 0.0108, "step": 7350 }, { "epoch": 0.42872953923224794, "grad_norm": 0.299724280834198, "learning_rate": 9.984793538513862e-05, "loss": 0.0111, "step": 7360 }, { "epoch": 0.4293120521931613, "grad_norm": 0.2745811939239502, "learning_rate": 9.984664408393912e-05, "loss": 0.0122, "step": 7370 }, { "epoch": 0.4298945651540747, "grad_norm": 0.2735130786895752, "learning_rate": 9.984534733158556e-05, "loss": 0.0144, "step": 7380 }, { "epoch": 0.43047707811498803, "grad_norm": 0.2541470527648926, "learning_rate": 9.984404512821977e-05, "loss": 0.0145, "step": 7390 }, { "epoch": 0.4310595910759014, "grad_norm": 0.25931864976882935, "learning_rate": 9.984273747398411e-05, "loss": 0.0133, "step": 7400 }, { "epoch": 0.4316421040368148, "grad_norm": 0.40068352222442627, "learning_rate": 9.984142436902165e-05, "loss": 0.0135, "step": 7410 }, { "epoch": 0.4322246169977282, "grad_norm": 0.2905597388744354, "learning_rate": 9.984010581347596e-05, "loss": 0.0137, "step": 7420 }, { "epoch": 0.43280712995864157, "grad_norm": 0.2182629406452179, "learning_rate": 9.983878180749121e-05, "loss": 0.0125, "step": 7430 }, { "epoch": 0.43338964291955495, "grad_norm": 0.214482843875885, "learning_rate": 9.983745235121222e-05, "loss": 0.0144, "step": 7440 }, { "epoch": 0.43397215588046834, "grad_norm": 0.2951166033744812, "learning_rate": 9.983611744478438e-05, "loss": 0.0109, "step": 7450 }, { "epoch": 0.4345546688413817, "grad_norm": 0.25570884346961975, "learning_rate": 9.983477708835365e-05, "loss": 0.0145, "step": 7460 }, { "epoch": 0.4351371818022951, "grad_norm": 0.32933321595191956, "learning_rate": 9.983343128206664e-05, "loss": 0.0123, "step": 7470 }, { "epoch": 0.4357196947632085, "grad_norm": 0.31572362780570984, "learning_rate": 9.983208002607049e-05, "loss": 0.013, "step": 7480 }, { "epoch": 0.43630220772412187, "grad_norm": 0.28836414217948914, "learning_rate": 9.9830723320513e-05, "loss": 0.0151, "step": 7490 }, { "epoch": 0.43688472068503525, "grad_norm": 0.5403913259506226, "learning_rate": 9.982936116554254e-05, "loss": 0.0142, "step": 7500 }, { "epoch": 0.43746723364594864, "grad_norm": 0.31371814012527466, "learning_rate": 9.982799356130803e-05, "loss": 0.0118, "step": 7510 }, { "epoch": 0.438049746606862, "grad_norm": 0.35043516755104065, "learning_rate": 9.982662050795908e-05, "loss": 0.016, "step": 7520 }, { "epoch": 0.4386322595677754, "grad_norm": 0.34342852234840393, "learning_rate": 9.982524200564583e-05, "loss": 0.0139, "step": 7530 }, { "epoch": 0.4392147725286888, "grad_norm": 0.3126801550388336, "learning_rate": 9.982385805451901e-05, "loss": 0.0118, "step": 7540 }, { "epoch": 0.4397972854896021, "grad_norm": 0.3324005901813507, "learning_rate": 9.982246865472998e-05, "loss": 0.011, "step": 7550 }, { "epoch": 0.4403797984505155, "grad_norm": 0.3001226782798767, "learning_rate": 9.982107380643069e-05, "loss": 0.0115, "step": 7560 }, { "epoch": 0.4409623114114289, "grad_norm": 0.33128130435943604, "learning_rate": 9.981967350977368e-05, "loss": 0.0117, "step": 7570 }, { "epoch": 0.44154482437234227, "grad_norm": 0.3360348045825958, "learning_rate": 9.981826776491208e-05, "loss": 0.0149, "step": 7580 }, { "epoch": 0.44212733733325565, "grad_norm": 0.3642211854457855, "learning_rate": 9.98168565719996e-05, "loss": 0.0135, "step": 7590 }, { "epoch": 0.44270985029416904, "grad_norm": 0.2591760456562042, "learning_rate": 9.98154399311906e-05, "loss": 0.0134, "step": 7600 }, { "epoch": 0.4432923632550824, "grad_norm": 0.4087824523448944, "learning_rate": 9.981401784263997e-05, "loss": 0.0121, "step": 7610 }, { "epoch": 0.4438748762159958, "grad_norm": 0.27956900000572205, "learning_rate": 9.981259030650326e-05, "loss": 0.0112, "step": 7620 }, { "epoch": 0.4444573891769092, "grad_norm": 0.3172702193260193, "learning_rate": 9.981115732293655e-05, "loss": 0.0127, "step": 7630 }, { "epoch": 0.4450399021378226, "grad_norm": 0.36412346363067627, "learning_rate": 9.980971889209659e-05, "loss": 0.0142, "step": 7640 }, { "epoch": 0.44562241509873596, "grad_norm": 0.27572396397590637, "learning_rate": 9.980827501414064e-05, "loss": 0.0163, "step": 7650 }, { "epoch": 0.44620492805964934, "grad_norm": 0.2515240013599396, "learning_rate": 9.980682568922663e-05, "loss": 0.0142, "step": 7660 }, { "epoch": 0.4467874410205627, "grad_norm": 0.31563711166381836, "learning_rate": 9.980537091751304e-05, "loss": 0.0144, "step": 7670 }, { "epoch": 0.4473699539814761, "grad_norm": 0.3616774082183838, "learning_rate": 9.980391069915897e-05, "loss": 0.0133, "step": 7680 }, { "epoch": 0.4479524669423895, "grad_norm": 0.27960044145584106, "learning_rate": 9.98024450343241e-05, "loss": 0.0114, "step": 7690 }, { "epoch": 0.4485349799033028, "grad_norm": 0.4146794080734253, "learning_rate": 9.980097392316872e-05, "loss": 0.0122, "step": 7700 }, { "epoch": 0.4491174928642162, "grad_norm": 0.35154545307159424, "learning_rate": 9.97994973658537e-05, "loss": 0.0131, "step": 7710 }, { "epoch": 0.4497000058251296, "grad_norm": 0.40042373538017273, "learning_rate": 9.979801536254054e-05, "loss": 0.01, "step": 7720 }, { "epoch": 0.450282518786043, "grad_norm": 0.27050063014030457, "learning_rate": 9.979652791339127e-05, "loss": 0.0149, "step": 7730 }, { "epoch": 0.45086503174695636, "grad_norm": 0.3406389057636261, "learning_rate": 9.97950350185686e-05, "loss": 0.0107, "step": 7740 }, { "epoch": 0.45144754470786974, "grad_norm": 0.3652243912220001, "learning_rate": 9.979353667823574e-05, "loss": 0.0134, "step": 7750 }, { "epoch": 0.4520300576687831, "grad_norm": 0.4828316569328308, "learning_rate": 9.979203289255658e-05, "loss": 0.0127, "step": 7760 }, { "epoch": 0.4526125706296965, "grad_norm": 0.32583287358283997, "learning_rate": 9.979052366169557e-05, "loss": 0.0111, "step": 7770 }, { "epoch": 0.4531950835906099, "grad_norm": 0.4049088954925537, "learning_rate": 9.978900898581775e-05, "loss": 0.0146, "step": 7780 }, { "epoch": 0.4537775965515233, "grad_norm": 0.34072497487068176, "learning_rate": 9.978748886508875e-05, "loss": 0.0116, "step": 7790 }, { "epoch": 0.45436010951243666, "grad_norm": 0.3274933099746704, "learning_rate": 9.978596329967484e-05, "loss": 0.011, "step": 7800 }, { "epoch": 0.45494262247335004, "grad_norm": 0.30960801243782043, "learning_rate": 9.978443228974284e-05, "loss": 0.0133, "step": 7810 }, { "epoch": 0.45552513543426343, "grad_norm": 0.30783092975616455, "learning_rate": 9.978289583546015e-05, "loss": 0.0165, "step": 7820 }, { "epoch": 0.4561076483951768, "grad_norm": 0.2957223057746887, "learning_rate": 9.978135393699484e-05, "loss": 0.0161, "step": 7830 }, { "epoch": 0.4566901613560902, "grad_norm": 0.3190397620201111, "learning_rate": 9.977980659451548e-05, "loss": 0.0137, "step": 7840 }, { "epoch": 0.4572726743170036, "grad_norm": 0.39251628518104553, "learning_rate": 9.977825380819135e-05, "loss": 0.0122, "step": 7850 }, { "epoch": 0.4578551872779169, "grad_norm": 0.6569514870643616, "learning_rate": 9.97766955781922e-05, "loss": 0.0102, "step": 7860 }, { "epoch": 0.4584377002388303, "grad_norm": 0.19619843363761902, "learning_rate": 9.977513190468848e-05, "loss": 0.0121, "step": 7870 }, { "epoch": 0.4590202131997437, "grad_norm": 0.3167249858379364, "learning_rate": 9.977356278785116e-05, "loss": 0.0133, "step": 7880 }, { "epoch": 0.45960272616065706, "grad_norm": 0.3681126832962036, "learning_rate": 9.977198822785184e-05, "loss": 0.0128, "step": 7890 }, { "epoch": 0.46018523912157044, "grad_norm": 0.36865857243537903, "learning_rate": 9.977040822486273e-05, "loss": 0.0139, "step": 7900 }, { "epoch": 0.46076775208248383, "grad_norm": 0.2314935177564621, "learning_rate": 9.97688227790566e-05, "loss": 0.0141, "step": 7910 }, { "epoch": 0.4613502650433972, "grad_norm": 0.2377949059009552, "learning_rate": 9.976723189060684e-05, "loss": 0.0116, "step": 7920 }, { "epoch": 0.4619327780043106, "grad_norm": 0.39535704255104065, "learning_rate": 9.976563555968742e-05, "loss": 0.014, "step": 7930 }, { "epoch": 0.462515290965224, "grad_norm": 0.2843906283378601, "learning_rate": 9.976403378647292e-05, "loss": 0.0128, "step": 7940 }, { "epoch": 0.46309780392613736, "grad_norm": 0.21500226855278015, "learning_rate": 9.97624265711385e-05, "loss": 0.0169, "step": 7950 }, { "epoch": 0.46368031688705075, "grad_norm": 0.3784261643886566, "learning_rate": 9.976081391385993e-05, "loss": 0.0165, "step": 7960 }, { "epoch": 0.46426282984796413, "grad_norm": 0.33142247796058655, "learning_rate": 9.975919581481356e-05, "loss": 0.0134, "step": 7970 }, { "epoch": 0.4648453428088775, "grad_norm": 0.34239137172698975, "learning_rate": 9.975757227417634e-05, "loss": 0.0196, "step": 7980 }, { "epoch": 0.4654278557697909, "grad_norm": 0.2868070602416992, "learning_rate": 9.975594329212586e-05, "loss": 0.015, "step": 7990 }, { "epoch": 0.4660103687307043, "grad_norm": 0.28554901480674744, "learning_rate": 9.97543088688402e-05, "loss": 0.0145, "step": 8000 }, { "epoch": 0.4665928816916176, "grad_norm": 0.5095058679580688, "learning_rate": 9.975266900449814e-05, "loss": 0.0178, "step": 8010 }, { "epoch": 0.467175394652531, "grad_norm": 0.31355223059654236, "learning_rate": 9.975102369927898e-05, "loss": 0.0151, "step": 8020 }, { "epoch": 0.4677579076134444, "grad_norm": 0.19202469289302826, "learning_rate": 9.974937295336269e-05, "loss": 0.0153, "step": 8030 }, { "epoch": 0.46834042057435776, "grad_norm": 0.23678675293922424, "learning_rate": 9.974771676692975e-05, "loss": 0.0121, "step": 8040 }, { "epoch": 0.46892293353527115, "grad_norm": 0.3065840005874634, "learning_rate": 9.974605514016131e-05, "loss": 0.0103, "step": 8050 }, { "epoch": 0.46950544649618453, "grad_norm": 0.30886998772621155, "learning_rate": 9.974438807323907e-05, "loss": 0.0145, "step": 8060 }, { "epoch": 0.4700879594570979, "grad_norm": 0.3345741629600525, "learning_rate": 9.974271556634535e-05, "loss": 0.0113, "step": 8070 }, { "epoch": 0.4706704724180113, "grad_norm": 0.3922256529331207, "learning_rate": 9.974103761966302e-05, "loss": 0.0131, "step": 8080 }, { "epoch": 0.4712529853789247, "grad_norm": 0.20716457068920135, "learning_rate": 9.973935423337563e-05, "loss": 0.01, "step": 8090 }, { "epoch": 0.47183549833983807, "grad_norm": 0.3493630886077881, "learning_rate": 9.973766540766722e-05, "loss": 0.0103, "step": 8100 }, { "epoch": 0.47241801130075145, "grad_norm": 0.3464565873146057, "learning_rate": 9.97359711427225e-05, "loss": 0.0103, "step": 8110 }, { "epoch": 0.47300052426166483, "grad_norm": 0.30762287974357605, "learning_rate": 9.973427143872677e-05, "loss": 0.0124, "step": 8120 }, { "epoch": 0.4735830372225782, "grad_norm": 0.19957633316516876, "learning_rate": 9.973256629586589e-05, "loss": 0.0092, "step": 8130 }, { "epoch": 0.4741655501834916, "grad_norm": 0.30018189549446106, "learning_rate": 9.973085571432632e-05, "loss": 0.014, "step": 8140 }, { "epoch": 0.474748063144405, "grad_norm": 0.35634303092956543, "learning_rate": 9.972913969429513e-05, "loss": 0.0132, "step": 8150 }, { "epoch": 0.47533057610531837, "grad_norm": 0.3178057372570038, "learning_rate": 9.972741823596e-05, "loss": 0.013, "step": 8160 }, { "epoch": 0.4759130890662317, "grad_norm": 0.34364962577819824, "learning_rate": 9.972569133950917e-05, "loss": 0.0119, "step": 8170 }, { "epoch": 0.4764956020271451, "grad_norm": 0.3316066563129425, "learning_rate": 9.972395900513151e-05, "loss": 0.0125, "step": 8180 }, { "epoch": 0.47707811498805847, "grad_norm": 0.25341787934303284, "learning_rate": 9.972222123301645e-05, "loss": 0.0137, "step": 8190 }, { "epoch": 0.47766062794897185, "grad_norm": 0.39359617233276367, "learning_rate": 9.972047802335403e-05, "loss": 0.0123, "step": 8200 }, { "epoch": 0.47824314090988523, "grad_norm": 0.23918533325195312, "learning_rate": 9.971872937633488e-05, "loss": 0.0122, "step": 8210 }, { "epoch": 0.4788256538707986, "grad_norm": 0.3521546423435211, "learning_rate": 9.971697529215024e-05, "loss": 0.011, "step": 8220 }, { "epoch": 0.479408166831712, "grad_norm": 0.5300173759460449, "learning_rate": 9.971521577099192e-05, "loss": 0.0122, "step": 8230 }, { "epoch": 0.4799906797926254, "grad_norm": 0.4083388149738312, "learning_rate": 9.971345081305236e-05, "loss": 0.0102, "step": 8240 }, { "epoch": 0.48057319275353877, "grad_norm": 0.3394054174423218, "learning_rate": 9.971168041852456e-05, "loss": 0.0124, "step": 8250 }, { "epoch": 0.48115570571445215, "grad_norm": 0.33593088388442993, "learning_rate": 9.970990458760215e-05, "loss": 0.0118, "step": 8260 }, { "epoch": 0.48173821867536554, "grad_norm": 0.3859274089336395, "learning_rate": 9.970812332047929e-05, "loss": 0.0165, "step": 8270 }, { "epoch": 0.4823207316362789, "grad_norm": 0.2736520767211914, "learning_rate": 9.97063366173508e-05, "loss": 0.0156, "step": 8280 }, { "epoch": 0.4829032445971923, "grad_norm": 0.3035624027252197, "learning_rate": 9.970454447841207e-05, "loss": 0.0134, "step": 8290 }, { "epoch": 0.4834857575581057, "grad_norm": 0.245099738240242, "learning_rate": 9.970274690385909e-05, "loss": 0.0107, "step": 8300 }, { "epoch": 0.4840682705190191, "grad_norm": 0.4431707262992859, "learning_rate": 9.970094389388844e-05, "loss": 0.0168, "step": 8310 }, { "epoch": 0.4846507834799324, "grad_norm": 0.5235276818275452, "learning_rate": 9.969913544869728e-05, "loss": 0.0135, "step": 8320 }, { "epoch": 0.4852332964408458, "grad_norm": 0.37572988867759705, "learning_rate": 9.96973215684834e-05, "loss": 0.0137, "step": 8330 }, { "epoch": 0.48581580940175917, "grad_norm": 0.38715246319770813, "learning_rate": 9.969550225344513e-05, "loss": 0.0129, "step": 8340 }, { "epoch": 0.48639832236267255, "grad_norm": 0.3668060600757599, "learning_rate": 9.969367750378147e-05, "loss": 0.0144, "step": 8350 }, { "epoch": 0.48698083532358594, "grad_norm": 0.35157275199890137, "learning_rate": 9.969184731969194e-05, "loss": 0.0134, "step": 8360 }, { "epoch": 0.4875633482844993, "grad_norm": 0.40042853355407715, "learning_rate": 9.96900117013767e-05, "loss": 0.0122, "step": 8370 }, { "epoch": 0.4881458612454127, "grad_norm": 0.21145382523536682, "learning_rate": 9.96881706490365e-05, "loss": 0.0114, "step": 8380 }, { "epoch": 0.4887283742063261, "grad_norm": 0.2801576554775238, "learning_rate": 9.968632416287265e-05, "loss": 0.0131, "step": 8390 }, { "epoch": 0.48931088716723947, "grad_norm": 0.21981655061244965, "learning_rate": 9.96844722430871e-05, "loss": 0.0092, "step": 8400 }, { "epoch": 0.48989340012815286, "grad_norm": 0.4167364835739136, "learning_rate": 9.968261488988235e-05, "loss": 0.0117, "step": 8410 }, { "epoch": 0.49047591308906624, "grad_norm": 0.4426529109477997, "learning_rate": 9.968075210346155e-05, "loss": 0.0114, "step": 8420 }, { "epoch": 0.4910584260499796, "grad_norm": 0.40926066040992737, "learning_rate": 9.967888388402839e-05, "loss": 0.0112, "step": 8430 }, { "epoch": 0.491640939010893, "grad_norm": 0.33130383491516113, "learning_rate": 9.967701023178717e-05, "loss": 0.0104, "step": 8440 }, { "epoch": 0.4922234519718064, "grad_norm": 0.3464617431163788, "learning_rate": 9.967513114694282e-05, "loss": 0.0109, "step": 8450 }, { "epoch": 0.4928059649327198, "grad_norm": 0.4233251214027405, "learning_rate": 9.967324662970079e-05, "loss": 0.0104, "step": 8460 }, { "epoch": 0.49338847789363316, "grad_norm": 0.39657196402549744, "learning_rate": 9.96713566802672e-05, "loss": 0.0119, "step": 8470 }, { "epoch": 0.4939709908545465, "grad_norm": 0.2708352506160736, "learning_rate": 9.966946129884873e-05, "loss": 0.0168, "step": 8480 }, { "epoch": 0.49455350381545987, "grad_norm": 0.3419860005378723, "learning_rate": 9.966756048565265e-05, "loss": 0.0135, "step": 8490 }, { "epoch": 0.49513601677637326, "grad_norm": 0.3863567113876343, "learning_rate": 9.966565424088681e-05, "loss": 0.0111, "step": 8500 }, { "epoch": 0.49571852973728664, "grad_norm": 0.3077118992805481, "learning_rate": 9.96637425647597e-05, "loss": 0.014, "step": 8510 }, { "epoch": 0.4963010426982, "grad_norm": 0.36019131541252136, "learning_rate": 9.966182545748038e-05, "loss": 0.0117, "step": 8520 }, { "epoch": 0.4968835556591134, "grad_norm": 0.306153267621994, "learning_rate": 9.96599029192585e-05, "loss": 0.0107, "step": 8530 }, { "epoch": 0.4974660686200268, "grad_norm": 0.24583961069583893, "learning_rate": 9.965797495030428e-05, "loss": 0.0111, "step": 8540 }, { "epoch": 0.4980485815809402, "grad_norm": 0.31469887495040894, "learning_rate": 9.96560415508286e-05, "loss": 0.0132, "step": 8550 }, { "epoch": 0.49863109454185356, "grad_norm": 0.1908465325832367, "learning_rate": 9.965410272104286e-05, "loss": 0.0106, "step": 8560 }, { "epoch": 0.49921360750276694, "grad_norm": 0.29951563477516174, "learning_rate": 9.96521584611591e-05, "loss": 0.0132, "step": 8570 }, { "epoch": 0.4997961204636803, "grad_norm": 0.27624136209487915, "learning_rate": 9.965020877138994e-05, "loss": 0.0111, "step": 8580 }, { "epoch": 0.5003786334245937, "grad_norm": 0.25615614652633667, "learning_rate": 9.964825365194861e-05, "loss": 0.0102, "step": 8590 }, { "epoch": 0.500961146385507, "grad_norm": 0.25534650683403015, "learning_rate": 9.96462931030489e-05, "loss": 0.0147, "step": 8600 }, { "epoch": 0.5015436593464204, "grad_norm": 0.4359297752380371, "learning_rate": 9.96443271249052e-05, "loss": 0.0131, "step": 8610 }, { "epoch": 0.5021261723073338, "grad_norm": 0.44120779633522034, "learning_rate": 9.964235571773255e-05, "loss": 0.0123, "step": 8620 }, { "epoch": 0.5027086852682472, "grad_norm": 0.3388446867465973, "learning_rate": 9.96403788817465e-05, "loss": 0.0121, "step": 8630 }, { "epoch": 0.5032911982291606, "grad_norm": 0.351571649312973, "learning_rate": 9.963839661716325e-05, "loss": 0.0106, "step": 8640 }, { "epoch": 0.503873711190074, "grad_norm": 0.23587334156036377, "learning_rate": 9.963640892419958e-05, "loss": 0.0139, "step": 8650 }, { "epoch": 0.5044562241509873, "grad_norm": 0.30069562792778015, "learning_rate": 9.963441580307286e-05, "loss": 0.012, "step": 8660 }, { "epoch": 0.5050387371119007, "grad_norm": 0.49241095781326294, "learning_rate": 9.963241725400104e-05, "loss": 0.0127, "step": 8670 }, { "epoch": 0.5056212500728141, "grad_norm": 0.33179011940956116, "learning_rate": 9.963041327720271e-05, "loss": 0.0152, "step": 8680 }, { "epoch": 0.5062037630337275, "grad_norm": 0.3685012757778168, "learning_rate": 9.962840387289697e-05, "loss": 0.0126, "step": 8690 }, { "epoch": 0.5067862759946409, "grad_norm": 0.4016934037208557, "learning_rate": 9.962638904130363e-05, "loss": 0.0118, "step": 8700 }, { "epoch": 0.5073687889555543, "grad_norm": 0.3884336054325104, "learning_rate": 9.962436878264298e-05, "loss": 0.0107, "step": 8710 }, { "epoch": 0.5079513019164676, "grad_norm": 0.5046244859695435, "learning_rate": 9.962234309713598e-05, "loss": 0.0118, "step": 8720 }, { "epoch": 0.508533814877381, "grad_norm": 0.35786616802215576, "learning_rate": 9.962031198500414e-05, "loss": 0.0137, "step": 8730 }, { "epoch": 0.5091163278382944, "grad_norm": 0.40751317143440247, "learning_rate": 9.961827544646958e-05, "loss": 0.012, "step": 8740 }, { "epoch": 0.5096988407992078, "grad_norm": 0.29337751865386963, "learning_rate": 9.961623348175501e-05, "loss": 0.0127, "step": 8750 }, { "epoch": 0.5102813537601212, "grad_norm": 0.4297778308391571, "learning_rate": 9.961418609108377e-05, "loss": 0.0143, "step": 8760 }, { "epoch": 0.5108638667210346, "grad_norm": 0.3457382321357727, "learning_rate": 9.961213327467971e-05, "loss": 0.0107, "step": 8770 }, { "epoch": 0.511446379681948, "grad_norm": 0.43843671679496765, "learning_rate": 9.961007503276736e-05, "loss": 0.0152, "step": 8780 }, { "epoch": 0.5120288926428613, "grad_norm": 0.4600363075733185, "learning_rate": 9.960801136557179e-05, "loss": 0.0149, "step": 8790 }, { "epoch": 0.5126114056037747, "grad_norm": 0.5583530068397522, "learning_rate": 9.960594227331866e-05, "loss": 0.0123, "step": 8800 }, { "epoch": 0.5131939185646881, "grad_norm": 0.25803712010383606, "learning_rate": 9.960386775623429e-05, "loss": 0.0167, "step": 8810 }, { "epoch": 0.5137764315256015, "grad_norm": 0.6141430139541626, "learning_rate": 9.96017878145455e-05, "loss": 0.0151, "step": 8820 }, { "epoch": 0.5143589444865149, "grad_norm": 0.6058645844459534, "learning_rate": 9.959970244847977e-05, "loss": 0.0135, "step": 8830 }, { "epoch": 0.5149414574474283, "grad_norm": 0.533699631690979, "learning_rate": 9.959761165826518e-05, "loss": 0.0116, "step": 8840 }, { "epoch": 0.5155239704083416, "grad_norm": 0.5360773205757141, "learning_rate": 9.959551544413033e-05, "loss": 0.0149, "step": 8850 }, { "epoch": 0.516106483369255, "grad_norm": 0.2782592475414276, "learning_rate": 9.959341380630448e-05, "loss": 0.0141, "step": 8860 }, { "epoch": 0.5166889963301683, "grad_norm": 0.34047409892082214, "learning_rate": 9.959130674501746e-05, "loss": 0.0107, "step": 8870 }, { "epoch": 0.5172715092910817, "grad_norm": 0.22537291049957275, "learning_rate": 9.958919426049968e-05, "loss": 0.0122, "step": 8880 }, { "epoch": 0.5178540222519951, "grad_norm": 0.23899054527282715, "learning_rate": 9.958707635298219e-05, "loss": 0.0109, "step": 8890 }, { "epoch": 0.5184365352129084, "grad_norm": 0.4367603361606598, "learning_rate": 9.958495302269657e-05, "loss": 0.0173, "step": 8900 }, { "epoch": 0.5190190481738218, "grad_norm": 0.3084893524646759, "learning_rate": 9.958282426987503e-05, "loss": 0.0097, "step": 8910 }, { "epoch": 0.5196015611347352, "grad_norm": 0.28005778789520264, "learning_rate": 9.95806900947504e-05, "loss": 0.0112, "step": 8920 }, { "epoch": 0.5201840740956486, "grad_norm": 0.2670397460460663, "learning_rate": 9.957855049755604e-05, "loss": 0.0086, "step": 8930 }, { "epoch": 0.520766587056562, "grad_norm": 0.34332460165023804, "learning_rate": 9.957640547852593e-05, "loss": 0.0109, "step": 8940 }, { "epoch": 0.5213491000174754, "grad_norm": 0.2118535041809082, "learning_rate": 9.957425503789466e-05, "loss": 0.0136, "step": 8950 }, { "epoch": 0.5219316129783887, "grad_norm": 0.3772883415222168, "learning_rate": 9.957209917589738e-05, "loss": 0.0139, "step": 8960 }, { "epoch": 0.5225141259393021, "grad_norm": 0.2075585573911667, "learning_rate": 9.956993789276987e-05, "loss": 0.0086, "step": 8970 }, { "epoch": 0.5230966389002155, "grad_norm": 0.2912015914916992, "learning_rate": 9.956777118874847e-05, "loss": 0.014, "step": 8980 }, { "epoch": 0.5236791518611289, "grad_norm": 0.34067502617836, "learning_rate": 9.956559906407016e-05, "loss": 0.012, "step": 8990 }, { "epoch": 0.5242616648220423, "grad_norm": 0.41253411769866943, "learning_rate": 9.956342151897245e-05, "loss": 0.0172, "step": 9000 }, { "epoch": 0.5248441777829557, "grad_norm": 0.22164370119571686, "learning_rate": 9.956123855369346e-05, "loss": 0.0117, "step": 9010 }, { "epoch": 0.525426690743869, "grad_norm": 0.27233901619911194, "learning_rate": 9.955905016847196e-05, "loss": 0.0093, "step": 9020 }, { "epoch": 0.5260092037047824, "grad_norm": 0.2521682679653168, "learning_rate": 9.955685636354723e-05, "loss": 0.0106, "step": 9030 }, { "epoch": 0.5265917166656958, "grad_norm": 0.20766761898994446, "learning_rate": 9.95546571391592e-05, "loss": 0.011, "step": 9040 }, { "epoch": 0.5271742296266092, "grad_norm": 0.2995697855949402, "learning_rate": 9.955245249554837e-05, "loss": 0.0123, "step": 9050 }, { "epoch": 0.5277567425875226, "grad_norm": 0.19737578928470612, "learning_rate": 9.955024243295582e-05, "loss": 0.0126, "step": 9060 }, { "epoch": 0.528339255548436, "grad_norm": 0.3154892325401306, "learning_rate": 9.954802695162328e-05, "loss": 0.0133, "step": 9070 }, { "epoch": 0.5289217685093494, "grad_norm": 0.3293045461177826, "learning_rate": 9.954580605179302e-05, "loss": 0.0141, "step": 9080 }, { "epoch": 0.5295042814702627, "grad_norm": 0.31475383043289185, "learning_rate": 9.954357973370788e-05, "loss": 0.0114, "step": 9090 }, { "epoch": 0.5300867944311761, "grad_norm": 0.23194412887096405, "learning_rate": 9.954134799761135e-05, "loss": 0.0105, "step": 9100 }, { "epoch": 0.5306693073920895, "grad_norm": 0.3737161159515381, "learning_rate": 9.953911084374748e-05, "loss": 0.0146, "step": 9110 }, { "epoch": 0.5312518203530029, "grad_norm": 0.3487817049026489, "learning_rate": 9.953686827236093e-05, "loss": 0.0117, "step": 9120 }, { "epoch": 0.5318343333139163, "grad_norm": 0.3458276391029358, "learning_rate": 9.953462028369695e-05, "loss": 0.0085, "step": 9130 }, { "epoch": 0.5324168462748297, "grad_norm": 0.2672206163406372, "learning_rate": 9.953236687800136e-05, "loss": 0.0121, "step": 9140 }, { "epoch": 0.532999359235743, "grad_norm": 0.4567156732082367, "learning_rate": 9.95301080555206e-05, "loss": 0.0139, "step": 9150 }, { "epoch": 0.5335818721966564, "grad_norm": 0.3230440616607666, "learning_rate": 9.952784381650171e-05, "loss": 0.0113, "step": 9160 }, { "epoch": 0.5341643851575698, "grad_norm": 0.34147533774375916, "learning_rate": 9.952557416119226e-05, "loss": 0.0141, "step": 9170 }, { "epoch": 0.5347468981184831, "grad_norm": 0.4173109531402588, "learning_rate": 9.95232990898405e-05, "loss": 0.0136, "step": 9180 }, { "epoch": 0.5353294110793965, "grad_norm": 0.3162938058376312, "learning_rate": 9.95210186026952e-05, "loss": 0.0105, "step": 9190 }, { "epoch": 0.5359119240403099, "grad_norm": 0.29883965849876404, "learning_rate": 9.951873270000576e-05, "loss": 0.0148, "step": 9200 }, { "epoch": 0.5364944370012232, "grad_norm": 0.40376511216163635, "learning_rate": 9.951644138202216e-05, "loss": 0.0131, "step": 9210 }, { "epoch": 0.5370769499621366, "grad_norm": 0.28043410181999207, "learning_rate": 9.951414464899498e-05, "loss": 0.0175, "step": 9220 }, { "epoch": 0.53765946292305, "grad_norm": 0.27490633726119995, "learning_rate": 9.951184250117538e-05, "loss": 0.0117, "step": 9230 }, { "epoch": 0.5382419758839634, "grad_norm": 0.3838854432106018, "learning_rate": 9.950953493881513e-05, "loss": 0.011, "step": 9240 }, { "epoch": 0.5388244888448768, "grad_norm": 0.3265721797943115, "learning_rate": 9.950722196216658e-05, "loss": 0.0138, "step": 9250 }, { "epoch": 0.5394070018057902, "grad_norm": 0.42262333631515503, "learning_rate": 9.950490357148265e-05, "loss": 0.0104, "step": 9260 }, { "epoch": 0.5399895147667035, "grad_norm": 0.38992148637771606, "learning_rate": 9.950257976701692e-05, "loss": 0.0159, "step": 9270 }, { "epoch": 0.5405720277276169, "grad_norm": 0.3030673563480377, "learning_rate": 9.950025054902348e-05, "loss": 0.0118, "step": 9280 }, { "epoch": 0.5411545406885303, "grad_norm": 0.32681745290756226, "learning_rate": 9.949791591775706e-05, "loss": 0.0105, "step": 9290 }, { "epoch": 0.5417370536494437, "grad_norm": 0.4332732856273651, "learning_rate": 9.949557587347298e-05, "loss": 0.0127, "step": 9300 }, { "epoch": 0.5423195666103571, "grad_norm": 0.23330433666706085, "learning_rate": 9.949323041642713e-05, "loss": 0.0133, "step": 9310 }, { "epoch": 0.5429020795712705, "grad_norm": 0.2538306713104248, "learning_rate": 9.949087954687602e-05, "loss": 0.0139, "step": 9320 }, { "epoch": 0.5434845925321838, "grad_norm": 0.4203927218914032, "learning_rate": 9.948852326507672e-05, "loss": 0.0125, "step": 9330 }, { "epoch": 0.5440671054930972, "grad_norm": 0.27585768699645996, "learning_rate": 9.948616157128694e-05, "loss": 0.0123, "step": 9340 }, { "epoch": 0.5446496184540106, "grad_norm": 0.3246493339538574, "learning_rate": 9.948379446576493e-05, "loss": 0.0104, "step": 9350 }, { "epoch": 0.545232131414924, "grad_norm": 0.4874025285243988, "learning_rate": 9.948142194876952e-05, "loss": 0.0128, "step": 9360 }, { "epoch": 0.5458146443758374, "grad_norm": 0.3689294755458832, "learning_rate": 9.947904402056024e-05, "loss": 0.0177, "step": 9370 }, { "epoch": 0.5463971573367508, "grad_norm": 0.36121460795402527, "learning_rate": 9.947666068139708e-05, "loss": 0.0142, "step": 9380 }, { "epoch": 0.5469796702976641, "grad_norm": 0.3328753709793091, "learning_rate": 9.947427193154071e-05, "loss": 0.0146, "step": 9390 }, { "epoch": 0.5475621832585775, "grad_norm": 0.3952317535877228, "learning_rate": 9.947187777125233e-05, "loss": 0.0107, "step": 9400 }, { "epoch": 0.5481446962194909, "grad_norm": 0.40359553694725037, "learning_rate": 9.946947820079377e-05, "loss": 0.012, "step": 9410 }, { "epoch": 0.5487272091804043, "grad_norm": 0.4122847020626068, "learning_rate": 9.946707322042747e-05, "loss": 0.0126, "step": 9420 }, { "epoch": 0.5493097221413177, "grad_norm": 0.2626955807209015, "learning_rate": 9.94646628304164e-05, "loss": 0.0104, "step": 9430 }, { "epoch": 0.5498922351022311, "grad_norm": 0.24648070335388184, "learning_rate": 9.946224703102418e-05, "loss": 0.0152, "step": 9440 }, { "epoch": 0.5504747480631444, "grad_norm": 0.1977761834859848, "learning_rate": 9.945982582251498e-05, "loss": 0.0105, "step": 9450 }, { "epoch": 0.5510572610240578, "grad_norm": 0.31380653381347656, "learning_rate": 9.94573992051536e-05, "loss": 0.0154, "step": 9460 }, { "epoch": 0.5516397739849712, "grad_norm": 0.22671660780906677, "learning_rate": 9.94549671792054e-05, "loss": 0.0126, "step": 9470 }, { "epoch": 0.5522222869458846, "grad_norm": 0.27327343821525574, "learning_rate": 9.945252974493635e-05, "loss": 0.0116, "step": 9480 }, { "epoch": 0.5528047999067979, "grad_norm": 0.4068441689014435, "learning_rate": 9.9450086902613e-05, "loss": 0.0141, "step": 9490 }, { "epoch": 0.5533873128677113, "grad_norm": 0.4184415638446808, "learning_rate": 9.944763865250248e-05, "loss": 0.015, "step": 9500 }, { "epoch": 0.5539698258286246, "grad_norm": 0.3811226189136505, "learning_rate": 9.944518499487254e-05, "loss": 0.0136, "step": 9510 }, { "epoch": 0.554552338789538, "grad_norm": 0.4261406362056732, "learning_rate": 9.944272592999151e-05, "loss": 0.0162, "step": 9520 }, { "epoch": 0.5551348517504514, "grad_norm": 0.27167704701423645, "learning_rate": 9.94402614581283e-05, "loss": 0.013, "step": 9530 }, { "epoch": 0.5557173647113648, "grad_norm": 0.2374947965145111, "learning_rate": 9.943779157955244e-05, "loss": 0.0153, "step": 9540 }, { "epoch": 0.5562998776722782, "grad_norm": 0.25550156831741333, "learning_rate": 9.943531629453403e-05, "loss": 0.0152, "step": 9550 }, { "epoch": 0.5568823906331916, "grad_norm": 0.2597905993461609, "learning_rate": 9.943283560334375e-05, "loss": 0.0121, "step": 9560 }, { "epoch": 0.5574649035941049, "grad_norm": 0.2636127471923828, "learning_rate": 9.943034950625288e-05, "loss": 0.0137, "step": 9570 }, { "epoch": 0.5580474165550183, "grad_norm": 0.3429103493690491, "learning_rate": 9.942785800353332e-05, "loss": 0.0124, "step": 9580 }, { "epoch": 0.5586299295159317, "grad_norm": 0.4438924491405487, "learning_rate": 9.942536109545751e-05, "loss": 0.0112, "step": 9590 }, { "epoch": 0.5592124424768451, "grad_norm": 0.3489965796470642, "learning_rate": 9.942285878229853e-05, "loss": 0.0152, "step": 9600 }, { "epoch": 0.5597949554377585, "grad_norm": 0.30171725153923035, "learning_rate": 9.942035106433001e-05, "loss": 0.0144, "step": 9610 }, { "epoch": 0.5603774683986719, "grad_norm": 0.39304476976394653, "learning_rate": 9.94178379418262e-05, "loss": 0.014, "step": 9620 }, { "epoch": 0.5609599813595852, "grad_norm": 0.49399781227111816, "learning_rate": 9.941531941506194e-05, "loss": 0.0162, "step": 9630 }, { "epoch": 0.5615424943204986, "grad_norm": 0.21328000724315643, "learning_rate": 9.941279548431263e-05, "loss": 0.0105, "step": 9640 }, { "epoch": 0.562125007281412, "grad_norm": 0.37901148200035095, "learning_rate": 9.941026614985431e-05, "loss": 0.0158, "step": 9650 }, { "epoch": 0.5627075202423254, "grad_norm": 0.5294866561889648, "learning_rate": 9.940773141196357e-05, "loss": 0.013, "step": 9660 }, { "epoch": 0.5632900332032388, "grad_norm": 0.2608047127723694, "learning_rate": 9.94051912709176e-05, "loss": 0.0122, "step": 9670 }, { "epoch": 0.5638725461641522, "grad_norm": 0.46421870589256287, "learning_rate": 9.940264572699421e-05, "loss": 0.0136, "step": 9680 }, { "epoch": 0.5644550591250656, "grad_norm": 0.35235297679901123, "learning_rate": 9.940009478047174e-05, "loss": 0.0103, "step": 9690 }, { "epoch": 0.5650375720859789, "grad_norm": 0.30639225244522095, "learning_rate": 9.939753843162918e-05, "loss": 0.0106, "step": 9700 }, { "epoch": 0.5656200850468923, "grad_norm": 0.21756641566753387, "learning_rate": 9.939497668074609e-05, "loss": 0.0125, "step": 9710 }, { "epoch": 0.5662025980078057, "grad_norm": 0.2911476492881775, "learning_rate": 9.93924095281026e-05, "loss": 0.012, "step": 9720 }, { "epoch": 0.5667851109687191, "grad_norm": 0.4516535699367523, "learning_rate": 9.938983697397948e-05, "loss": 0.0116, "step": 9730 }, { "epoch": 0.5673676239296325, "grad_norm": 0.255349338054657, "learning_rate": 9.938725901865805e-05, "loss": 0.0109, "step": 9740 }, { "epoch": 0.5679501368905459, "grad_norm": 0.3804728090763092, "learning_rate": 9.93846756624202e-05, "loss": 0.0143, "step": 9750 }, { "epoch": 0.5685326498514592, "grad_norm": 0.29119783639907837, "learning_rate": 9.938208690554849e-05, "loss": 0.0105, "step": 9760 }, { "epoch": 0.5691151628123726, "grad_norm": 0.3434184193611145, "learning_rate": 9.9379492748326e-05, "loss": 0.0101, "step": 9770 }, { "epoch": 0.569697675773286, "grad_norm": 0.30036213994026184, "learning_rate": 9.937689319103641e-05, "loss": 0.0111, "step": 9780 }, { "epoch": 0.5702801887341994, "grad_norm": 0.18254032731056213, "learning_rate": 9.937428823396404e-05, "loss": 0.0125, "step": 9790 }, { "epoch": 0.5708627016951127, "grad_norm": 0.24913308024406433, "learning_rate": 9.937167787739372e-05, "loss": 0.0084, "step": 9800 }, { "epoch": 0.571445214656026, "grad_norm": 0.3342718482017517, "learning_rate": 9.936906212161095e-05, "loss": 0.0131, "step": 9810 }, { "epoch": 0.5720277276169394, "grad_norm": 0.4450584650039673, "learning_rate": 9.936644096690176e-05, "loss": 0.0103, "step": 9820 }, { "epoch": 0.5726102405778528, "grad_norm": 0.2760072946548462, "learning_rate": 9.936381441355282e-05, "loss": 0.016, "step": 9830 }, { "epoch": 0.5731927535387662, "grad_norm": 0.56248939037323, "learning_rate": 9.936118246185136e-05, "loss": 0.0162, "step": 9840 }, { "epoch": 0.5737752664996796, "grad_norm": 0.42612624168395996, "learning_rate": 9.935854511208518e-05, "loss": 0.019, "step": 9850 }, { "epoch": 0.574357779460593, "grad_norm": 0.4512110650539398, "learning_rate": 9.935590236454272e-05, "loss": 0.017, "step": 9860 }, { "epoch": 0.5749402924215063, "grad_norm": 0.3594953715801239, "learning_rate": 9.935325421951298e-05, "loss": 0.0115, "step": 9870 }, { "epoch": 0.5755228053824197, "grad_norm": 0.17981648445129395, "learning_rate": 9.935060067728557e-05, "loss": 0.0122, "step": 9880 }, { "epoch": 0.5761053183433331, "grad_norm": 0.28715601563453674, "learning_rate": 9.934794173815067e-05, "loss": 0.0137, "step": 9890 }, { "epoch": 0.5766878313042465, "grad_norm": 0.3541494607925415, "learning_rate": 9.934527740239906e-05, "loss": 0.012, "step": 9900 }, { "epoch": 0.5772703442651599, "grad_norm": 0.20331743359565735, "learning_rate": 9.934260767032209e-05, "loss": 0.0117, "step": 9910 }, { "epoch": 0.5778528572260733, "grad_norm": 0.32397255301475525, "learning_rate": 9.933993254221172e-05, "loss": 0.0127, "step": 9920 }, { "epoch": 0.5784353701869867, "grad_norm": 0.2809850871562958, "learning_rate": 9.933725201836053e-05, "loss": 0.0085, "step": 9930 }, { "epoch": 0.5790178831479, "grad_norm": 0.27571478486061096, "learning_rate": 9.933456609906162e-05, "loss": 0.014, "step": 9940 }, { "epoch": 0.5796003961088134, "grad_norm": 0.3052954077720642, "learning_rate": 9.933187478460875e-05, "loss": 0.0143, "step": 9950 }, { "epoch": 0.5801829090697268, "grad_norm": 0.42121222615242004, "learning_rate": 9.93291780752962e-05, "loss": 0.0104, "step": 9960 }, { "epoch": 0.5807654220306402, "grad_norm": 0.3771815598011017, "learning_rate": 9.932647597141893e-05, "loss": 0.0117, "step": 9970 }, { "epoch": 0.5813479349915536, "grad_norm": 0.29227063059806824, "learning_rate": 9.932376847327239e-05, "loss": 0.0112, "step": 9980 }, { "epoch": 0.581930447952467, "grad_norm": 0.4178682267665863, "learning_rate": 9.932105558115268e-05, "loss": 0.0137, "step": 9990 }, { "epoch": 0.5825129609133803, "grad_norm": 0.4021865129470825, "learning_rate": 9.931833729535651e-05, "loss": 0.013, "step": 10000 }, { "epoch": 0.5830954738742937, "grad_norm": 0.2939503490924835, "learning_rate": 9.931561361618111e-05, "loss": 0.0114, "step": 10010 }, { "epoch": 0.5836779868352071, "grad_norm": 0.2946171462535858, "learning_rate": 9.931288454392435e-05, "loss": 0.0124, "step": 10020 }, { "epoch": 0.5842604997961205, "grad_norm": 0.2849358320236206, "learning_rate": 9.931015007888467e-05, "loss": 0.0117, "step": 10030 }, { "epoch": 0.5848430127570339, "grad_norm": 0.46997353434562683, "learning_rate": 9.930741022136112e-05, "loss": 0.0142, "step": 10040 }, { "epoch": 0.5854255257179473, "grad_norm": 0.3005836606025696, "learning_rate": 9.930466497165333e-05, "loss": 0.0093, "step": 10050 }, { "epoch": 0.5860080386788606, "grad_norm": 0.2682529091835022, "learning_rate": 9.93019143300615e-05, "loss": 0.0115, "step": 10060 }, { "epoch": 0.586590551639774, "grad_norm": 0.22398750483989716, "learning_rate": 9.929915829688644e-05, "loss": 0.0116, "step": 10070 }, { "epoch": 0.5871730646006874, "grad_norm": 0.16143423318862915, "learning_rate": 9.929639687242955e-05, "loss": 0.0103, "step": 10080 }, { "epoch": 0.5877555775616008, "grad_norm": 0.19682370126247406, "learning_rate": 9.929363005699281e-05, "loss": 0.0097, "step": 10090 }, { "epoch": 0.5883380905225142, "grad_norm": 0.23318275809288025, "learning_rate": 9.92908578508788e-05, "loss": 0.0101, "step": 10100 }, { "epoch": 0.5889206034834275, "grad_norm": 0.3219374120235443, "learning_rate": 9.928808025439069e-05, "loss": 0.011, "step": 10110 }, { "epoch": 0.5895031164443408, "grad_norm": 0.5772382020950317, "learning_rate": 9.928529726783223e-05, "loss": 0.0116, "step": 10120 }, { "epoch": 0.5900856294052542, "grad_norm": 0.2524924576282501, "learning_rate": 9.928250889150774e-05, "loss": 0.0101, "step": 10130 }, { "epoch": 0.5906681423661676, "grad_norm": 0.3121965825557709, "learning_rate": 9.92797151257222e-05, "loss": 0.0106, "step": 10140 }, { "epoch": 0.591250655327081, "grad_norm": 0.21047072112560272, "learning_rate": 9.927691597078108e-05, "loss": 0.0094, "step": 10150 }, { "epoch": 0.5918331682879944, "grad_norm": 0.3174176812171936, "learning_rate": 9.927411142699053e-05, "loss": 0.0103, "step": 10160 }, { "epoch": 0.5924156812489078, "grad_norm": 0.2864536941051483, "learning_rate": 9.927130149465725e-05, "loss": 0.0108, "step": 10170 }, { "epoch": 0.5929981942098211, "grad_norm": 0.28295257687568665, "learning_rate": 9.92684861740885e-05, "loss": 0.0114, "step": 10180 }, { "epoch": 0.5935807071707345, "grad_norm": 0.3985724449157715, "learning_rate": 9.926566546559217e-05, "loss": 0.0113, "step": 10190 }, { "epoch": 0.5941632201316479, "grad_norm": 0.21181337535381317, "learning_rate": 9.926283936947673e-05, "loss": 0.0115, "step": 10200 }, { "epoch": 0.5947457330925613, "grad_norm": 0.3545110821723938, "learning_rate": 9.926000788605126e-05, "loss": 0.012, "step": 10210 }, { "epoch": 0.5953282460534747, "grad_norm": 0.3196934759616852, "learning_rate": 9.92571710156254e-05, "loss": 0.0108, "step": 10220 }, { "epoch": 0.5959107590143881, "grad_norm": 0.33729833364486694, "learning_rate": 9.925432875850936e-05, "loss": 0.0103, "step": 10230 }, { "epoch": 0.5964932719753014, "grad_norm": 0.3225812017917633, "learning_rate": 9.925148111501396e-05, "loss": 0.011, "step": 10240 }, { "epoch": 0.5970757849362148, "grad_norm": 0.42194247245788574, "learning_rate": 9.924862808545066e-05, "loss": 0.0114, "step": 10250 }, { "epoch": 0.5976582978971282, "grad_norm": 0.3312457799911499, "learning_rate": 9.924576967013141e-05, "loss": 0.0124, "step": 10260 }, { "epoch": 0.5982408108580416, "grad_norm": 0.29380330443382263, "learning_rate": 9.924290586936887e-05, "loss": 0.0118, "step": 10270 }, { "epoch": 0.598823323818955, "grad_norm": 0.3894776701927185, "learning_rate": 9.924003668347614e-05, "loss": 0.0184, "step": 10280 }, { "epoch": 0.5994058367798684, "grad_norm": 0.38664546608924866, "learning_rate": 9.923716211276704e-05, "loss": 0.0152, "step": 10290 }, { "epoch": 0.5999883497407817, "grad_norm": 0.1862625777721405, "learning_rate": 9.923428215755594e-05, "loss": 0.0113, "step": 10300 }, { "epoch": 0.6005708627016951, "grad_norm": 0.213981494307518, "learning_rate": 9.923139681815775e-05, "loss": 0.0111, "step": 10310 }, { "epoch": 0.6011533756626085, "grad_norm": 0.2424970418214798, "learning_rate": 9.922850609488801e-05, "loss": 0.0123, "step": 10320 }, { "epoch": 0.6017358886235219, "grad_norm": 0.4181225299835205, "learning_rate": 9.922560998806287e-05, "loss": 0.0098, "step": 10330 }, { "epoch": 0.6023184015844353, "grad_norm": 0.35067424178123474, "learning_rate": 9.922270849799905e-05, "loss": 0.0109, "step": 10340 }, { "epoch": 0.6029009145453487, "grad_norm": 0.30152347683906555, "learning_rate": 9.92198016250138e-05, "loss": 0.0101, "step": 10350 }, { "epoch": 0.603483427506262, "grad_norm": 0.23745670914649963, "learning_rate": 9.921688936942506e-05, "loss": 0.0089, "step": 10360 }, { "epoch": 0.6040659404671754, "grad_norm": 0.4549332559108734, "learning_rate": 9.921397173155129e-05, "loss": 0.012, "step": 10370 }, { "epoch": 0.6046484534280888, "grad_norm": 0.4391336441040039, "learning_rate": 9.921104871171157e-05, "loss": 0.0133, "step": 10380 }, { "epoch": 0.6052309663890022, "grad_norm": 0.37809792160987854, "learning_rate": 9.920812031022554e-05, "loss": 0.012, "step": 10390 }, { "epoch": 0.6058134793499156, "grad_norm": 0.16462819278240204, "learning_rate": 9.920518652741348e-05, "loss": 0.0135, "step": 10400 }, { "epoch": 0.606395992310829, "grad_norm": 0.37245261669158936, "learning_rate": 9.920224736359618e-05, "loss": 0.0127, "step": 10410 }, { "epoch": 0.6069785052717422, "grad_norm": 0.22607602179050446, "learning_rate": 9.91993028190951e-05, "loss": 0.0113, "step": 10420 }, { "epoch": 0.6075610182326556, "grad_norm": 0.3402539789676666, "learning_rate": 9.919635289423222e-05, "loss": 0.0102, "step": 10430 }, { "epoch": 0.608143531193569, "grad_norm": 0.30011066794395447, "learning_rate": 9.919339758933015e-05, "loss": 0.012, "step": 10440 }, { "epoch": 0.6087260441544824, "grad_norm": 0.409231036901474, "learning_rate": 9.919043690471209e-05, "loss": 0.0125, "step": 10450 }, { "epoch": 0.6093085571153958, "grad_norm": 0.3074800372123718, "learning_rate": 9.91874708407018e-05, "loss": 0.0116, "step": 10460 }, { "epoch": 0.6098910700763092, "grad_norm": 0.2519512176513672, "learning_rate": 9.918449939762367e-05, "loss": 0.0121, "step": 10470 }, { "epoch": 0.6104735830372225, "grad_norm": 0.5551772713661194, "learning_rate": 9.91815225758026e-05, "loss": 0.0113, "step": 10480 }, { "epoch": 0.6110560959981359, "grad_norm": 0.2939736843109131, "learning_rate": 9.917854037556419e-05, "loss": 0.012, "step": 10490 }, { "epoch": 0.6116386089590493, "grad_norm": 0.4155401587486267, "learning_rate": 9.917555279723454e-05, "loss": 0.0124, "step": 10500 }, { "epoch": 0.6122211219199627, "grad_norm": 0.30529963970184326, "learning_rate": 9.917255984114036e-05, "loss": 0.0142, "step": 10510 }, { "epoch": 0.6128036348808761, "grad_norm": 0.3827075958251953, "learning_rate": 9.916956150760896e-05, "loss": 0.0124, "step": 10520 }, { "epoch": 0.6133861478417895, "grad_norm": 0.44922056794166565, "learning_rate": 9.916655779696826e-05, "loss": 0.0147, "step": 10530 }, { "epoch": 0.6139686608027028, "grad_norm": 0.3514328598976135, "learning_rate": 9.916354870954671e-05, "loss": 0.0108, "step": 10540 }, { "epoch": 0.6145511737636162, "grad_norm": 0.29682257771492004, "learning_rate": 9.91605342456734e-05, "loss": 0.0099, "step": 10550 }, { "epoch": 0.6151336867245296, "grad_norm": 0.31869587302207947, "learning_rate": 9.915751440567795e-05, "loss": 0.0118, "step": 10560 }, { "epoch": 0.615716199685443, "grad_norm": 0.2644910514354706, "learning_rate": 9.915448918989066e-05, "loss": 0.0123, "step": 10570 }, { "epoch": 0.6162987126463564, "grad_norm": 0.4025249481201172, "learning_rate": 9.915145859864232e-05, "loss": 0.0103, "step": 10580 }, { "epoch": 0.6168812256072698, "grad_norm": 0.29970893263816833, "learning_rate": 9.914842263226437e-05, "loss": 0.0124, "step": 10590 }, { "epoch": 0.6174637385681832, "grad_norm": 0.35187485814094543, "learning_rate": 9.914538129108882e-05, "loss": 0.014, "step": 10600 }, { "epoch": 0.6180462515290965, "grad_norm": 0.5014175772666931, "learning_rate": 9.914233457544825e-05, "loss": 0.0102, "step": 10610 }, { "epoch": 0.6186287644900099, "grad_norm": 0.1783691793680191, "learning_rate": 9.913928248567586e-05, "loss": 0.0112, "step": 10620 }, { "epoch": 0.6192112774509233, "grad_norm": 0.279690146446228, "learning_rate": 9.913622502210542e-05, "loss": 0.0125, "step": 10630 }, { "epoch": 0.6197937904118367, "grad_norm": 0.27393609285354614, "learning_rate": 9.913316218507128e-05, "loss": 0.0096, "step": 10640 }, { "epoch": 0.6203763033727501, "grad_norm": 0.3219526708126068, "learning_rate": 9.91300939749084e-05, "loss": 0.0089, "step": 10650 }, { "epoch": 0.6209588163336635, "grad_norm": 0.33202069997787476, "learning_rate": 9.91270203919523e-05, "loss": 0.0122, "step": 10660 }, { "epoch": 0.6215413292945768, "grad_norm": 0.3636094629764557, "learning_rate": 9.912394143653912e-05, "loss": 0.0089, "step": 10670 }, { "epoch": 0.6221238422554902, "grad_norm": 0.28989094495773315, "learning_rate": 9.912085710900555e-05, "loss": 0.012, "step": 10680 }, { "epoch": 0.6227063552164036, "grad_norm": 0.3201829791069031, "learning_rate": 9.911776740968892e-05, "loss": 0.0087, "step": 10690 }, { "epoch": 0.623288868177317, "grad_norm": 0.2217019498348236, "learning_rate": 9.911467233892709e-05, "loss": 0.0093, "step": 10700 }, { "epoch": 0.6238713811382304, "grad_norm": 0.33159610629081726, "learning_rate": 9.911157189705853e-05, "loss": 0.0135, "step": 10710 }, { "epoch": 0.6244538940991438, "grad_norm": 0.36276859045028687, "learning_rate": 9.910846608442229e-05, "loss": 0.0084, "step": 10720 }, { "epoch": 0.625036407060057, "grad_norm": 0.4524948298931122, "learning_rate": 9.910535490135805e-05, "loss": 0.0093, "step": 10730 }, { "epoch": 0.6256189200209704, "grad_norm": 0.5551922917366028, "learning_rate": 9.910223834820603e-05, "loss": 0.0109, "step": 10740 }, { "epoch": 0.6262014329818838, "grad_norm": 0.45265284180641174, "learning_rate": 9.909911642530703e-05, "loss": 0.0106, "step": 10750 }, { "epoch": 0.6267839459427972, "grad_norm": 0.4039379954338074, "learning_rate": 9.909598913300249e-05, "loss": 0.0122, "step": 10760 }, { "epoch": 0.6273664589037106, "grad_norm": 0.4049648344516754, "learning_rate": 9.909285647163438e-05, "loss": 0.0112, "step": 10770 }, { "epoch": 0.627948971864624, "grad_norm": 0.3931305706501007, "learning_rate": 9.908971844154531e-05, "loss": 0.0114, "step": 10780 }, { "epoch": 0.6285314848255373, "grad_norm": 0.2517894208431244, "learning_rate": 9.908657504307843e-05, "loss": 0.0135, "step": 10790 }, { "epoch": 0.6291139977864507, "grad_norm": 0.24923042953014374, "learning_rate": 9.908342627657751e-05, "loss": 0.0108, "step": 10800 }, { "epoch": 0.6296965107473641, "grad_norm": 0.2767876982688904, "learning_rate": 9.908027214238689e-05, "loss": 0.0111, "step": 10810 }, { "epoch": 0.6302790237082775, "grad_norm": 0.374244749546051, "learning_rate": 9.90771126408515e-05, "loss": 0.0139, "step": 10820 }, { "epoch": 0.6308615366691909, "grad_norm": 0.282764196395874, "learning_rate": 9.907394777231685e-05, "loss": 0.0118, "step": 10830 }, { "epoch": 0.6314440496301043, "grad_norm": 0.36659327149391174, "learning_rate": 9.907077753712905e-05, "loss": 0.0121, "step": 10840 }, { "epoch": 0.6320265625910176, "grad_norm": 0.34629327058792114, "learning_rate": 9.906760193563482e-05, "loss": 0.0122, "step": 10850 }, { "epoch": 0.632609075551931, "grad_norm": 0.25372394919395447, "learning_rate": 9.906442096818139e-05, "loss": 0.0116, "step": 10860 }, { "epoch": 0.6331915885128444, "grad_norm": 0.21794742345809937, "learning_rate": 9.906123463511665e-05, "loss": 0.0096, "step": 10870 }, { "epoch": 0.6337741014737578, "grad_norm": 0.30076029896736145, "learning_rate": 9.905804293678907e-05, "loss": 0.0101, "step": 10880 }, { "epoch": 0.6343566144346712, "grad_norm": 0.3733968138694763, "learning_rate": 9.905484587354766e-05, "loss": 0.0112, "step": 10890 }, { "epoch": 0.6349391273955846, "grad_norm": 0.3687763512134552, "learning_rate": 9.905164344574205e-05, "loss": 0.0113, "step": 10900 }, { "epoch": 0.6355216403564979, "grad_norm": 0.2531723082065582, "learning_rate": 9.904843565372248e-05, "loss": 0.0101, "step": 10910 }, { "epoch": 0.6361041533174113, "grad_norm": 0.18912546336650848, "learning_rate": 9.904522249783972e-05, "loss": 0.0091, "step": 10920 }, { "epoch": 0.6366866662783247, "grad_norm": 0.2543080747127533, "learning_rate": 9.904200397844517e-05, "loss": 0.0116, "step": 10930 }, { "epoch": 0.6372691792392381, "grad_norm": 0.23499220609664917, "learning_rate": 9.903878009589078e-05, "loss": 0.0094, "step": 10940 }, { "epoch": 0.6378516922001515, "grad_norm": 0.2199750542640686, "learning_rate": 9.903555085052915e-05, "loss": 0.0098, "step": 10950 }, { "epoch": 0.6384342051610649, "grad_norm": 0.4099353849887848, "learning_rate": 9.903231624271338e-05, "loss": 0.0093, "step": 10960 }, { "epoch": 0.6390167181219782, "grad_norm": 0.419627845287323, "learning_rate": 9.902907627279724e-05, "loss": 0.01, "step": 10970 }, { "epoch": 0.6395992310828916, "grad_norm": 0.43236517906188965, "learning_rate": 9.902583094113504e-05, "loss": 0.011, "step": 10980 }, { "epoch": 0.640181744043805, "grad_norm": 0.2392715960741043, "learning_rate": 9.902258024808168e-05, "loss": 0.0123, "step": 10990 }, { "epoch": 0.6407642570047184, "grad_norm": 0.3250366747379303, "learning_rate": 9.901932419399264e-05, "loss": 0.0143, "step": 11000 }, { "epoch": 0.6413467699656318, "grad_norm": 0.21828626096248627, "learning_rate": 9.9016062779224e-05, "loss": 0.0103, "step": 11010 }, { "epoch": 0.6419292829265452, "grad_norm": 0.24479924142360687, "learning_rate": 9.901279600413242e-05, "loss": 0.0095, "step": 11020 }, { "epoch": 0.6425117958874585, "grad_norm": 0.43089306354522705, "learning_rate": 9.900952386907518e-05, "loss": 0.0125, "step": 11030 }, { "epoch": 0.6430943088483719, "grad_norm": 0.3417492210865021, "learning_rate": 9.90062463744101e-05, "loss": 0.0088, "step": 11040 }, { "epoch": 0.6436768218092852, "grad_norm": 0.36500176787376404, "learning_rate": 9.900296352049558e-05, "loss": 0.0117, "step": 11050 }, { "epoch": 0.6442593347701986, "grad_norm": 0.2780839800834656, "learning_rate": 9.899967530769065e-05, "loss": 0.0073, "step": 11060 }, { "epoch": 0.644841847731112, "grad_norm": 0.31858375668525696, "learning_rate": 9.899638173635489e-05, "loss": 0.0108, "step": 11070 }, { "epoch": 0.6454243606920254, "grad_norm": 0.33948594331741333, "learning_rate": 9.899308280684849e-05, "loss": 0.0106, "step": 11080 }, { "epoch": 0.6460068736529387, "grad_norm": 0.2951587438583374, "learning_rate": 9.898977851953222e-05, "loss": 0.0108, "step": 11090 }, { "epoch": 0.6465893866138521, "grad_norm": 0.33963990211486816, "learning_rate": 9.898646887476741e-05, "loss": 0.0137, "step": 11100 }, { "epoch": 0.6471718995747655, "grad_norm": 0.2674458622932434, "learning_rate": 9.898315387291603e-05, "loss": 0.0133, "step": 11110 }, { "epoch": 0.6477544125356789, "grad_norm": 0.2317279577255249, "learning_rate": 9.89798335143406e-05, "loss": 0.0115, "step": 11120 }, { "epoch": 0.6483369254965923, "grad_norm": 0.43236392736434937, "learning_rate": 9.897650779940419e-05, "loss": 0.0111, "step": 11130 }, { "epoch": 0.6489194384575057, "grad_norm": 0.37345650792121887, "learning_rate": 9.897317672847054e-05, "loss": 0.0155, "step": 11140 }, { "epoch": 0.649501951418419, "grad_norm": 0.2641894519329071, "learning_rate": 9.89698403019039e-05, "loss": 0.0162, "step": 11150 }, { "epoch": 0.6500844643793324, "grad_norm": 0.23331056535243988, "learning_rate": 9.896649852006917e-05, "loss": 0.0138, "step": 11160 }, { "epoch": 0.6506669773402458, "grad_norm": 0.4068915843963623, "learning_rate": 9.896315138333177e-05, "loss": 0.013, "step": 11170 }, { "epoch": 0.6512494903011592, "grad_norm": 0.31883156299591064, "learning_rate": 9.895979889205774e-05, "loss": 0.0115, "step": 11180 }, { "epoch": 0.6518320032620726, "grad_norm": 0.3925724923610687, "learning_rate": 9.895644104661372e-05, "loss": 0.0177, "step": 11190 }, { "epoch": 0.652414516222986, "grad_norm": 0.4251573383808136, "learning_rate": 9.895307784736691e-05, "loss": 0.0155, "step": 11200 }, { "epoch": 0.6529970291838993, "grad_norm": 0.39757588505744934, "learning_rate": 9.894970929468512e-05, "loss": 0.0145, "step": 11210 }, { "epoch": 0.6535795421448127, "grad_norm": 0.3408699333667755, "learning_rate": 9.89463353889367e-05, "loss": 0.0123, "step": 11220 }, { "epoch": 0.6541620551057261, "grad_norm": 0.43814817070961, "learning_rate": 9.894295613049065e-05, "loss": 0.0125, "step": 11230 }, { "epoch": 0.6547445680666395, "grad_norm": 0.24500298500061035, "learning_rate": 9.893957151971649e-05, "loss": 0.0104, "step": 11240 }, { "epoch": 0.6553270810275529, "grad_norm": 0.29698458313941956, "learning_rate": 9.893618155698436e-05, "loss": 0.019, "step": 11250 }, { "epoch": 0.6559095939884663, "grad_norm": 0.33224183320999146, "learning_rate": 9.8932786242665e-05, "loss": 0.0103, "step": 11260 }, { "epoch": 0.6564921069493797, "grad_norm": 0.3274209797382355, "learning_rate": 9.89293855771297e-05, "loss": 0.0103, "step": 11270 }, { "epoch": 0.657074619910293, "grad_norm": 0.25113263726234436, "learning_rate": 9.892597956075036e-05, "loss": 0.0092, "step": 11280 }, { "epoch": 0.6576571328712064, "grad_norm": 0.33727148175239563, "learning_rate": 9.892256819389947e-05, "loss": 0.0134, "step": 11290 }, { "epoch": 0.6582396458321198, "grad_norm": 0.36965233087539673, "learning_rate": 9.891915147695006e-05, "loss": 0.0123, "step": 11300 }, { "epoch": 0.6588221587930332, "grad_norm": 0.24199701845645905, "learning_rate": 9.891572941027577e-05, "loss": 0.0125, "step": 11310 }, { "epoch": 0.6594046717539466, "grad_norm": 0.22798088192939758, "learning_rate": 9.89123019942509e-05, "loss": 0.0091, "step": 11320 }, { "epoch": 0.65998718471486, "grad_norm": 0.30673107504844666, "learning_rate": 9.89088692292502e-05, "loss": 0.0116, "step": 11330 }, { "epoch": 0.6605696976757733, "grad_norm": 0.41818109154701233, "learning_rate": 9.89054311156491e-05, "loss": 0.0153, "step": 11340 }, { "epoch": 0.6611522106366867, "grad_norm": 0.2872347831726074, "learning_rate": 9.890198765382357e-05, "loss": 0.0103, "step": 11350 }, { "epoch": 0.6617347235976, "grad_norm": 0.20405597984790802, "learning_rate": 9.889853884415021e-05, "loss": 0.0097, "step": 11360 }, { "epoch": 0.6623172365585134, "grad_norm": 0.18058334290981293, "learning_rate": 9.889508468700614e-05, "loss": 0.0154, "step": 11370 }, { "epoch": 0.6628997495194268, "grad_norm": 0.16348174214363098, "learning_rate": 9.889162518276915e-05, "loss": 0.0095, "step": 11380 }, { "epoch": 0.6634822624803401, "grad_norm": 0.20774662494659424, "learning_rate": 9.888816033181752e-05, "loss": 0.0091, "step": 11390 }, { "epoch": 0.6640647754412535, "grad_norm": 0.27124476432800293, "learning_rate": 9.888469013453018e-05, "loss": 0.0093, "step": 11400 }, { "epoch": 0.6646472884021669, "grad_norm": 0.2653883993625641, "learning_rate": 9.888121459128663e-05, "loss": 0.0112, "step": 11410 }, { "epoch": 0.6652298013630803, "grad_norm": 0.32491305470466614, "learning_rate": 9.887773370246693e-05, "loss": 0.011, "step": 11420 }, { "epoch": 0.6658123143239937, "grad_norm": 0.22934333980083466, "learning_rate": 9.887424746845177e-05, "loss": 0.0113, "step": 11430 }, { "epoch": 0.6663948272849071, "grad_norm": 0.30383703112602234, "learning_rate": 9.887075588962239e-05, "loss": 0.0094, "step": 11440 }, { "epoch": 0.6669773402458204, "grad_norm": 0.31817537546157837, "learning_rate": 9.88672589663606e-05, "loss": 0.0121, "step": 11450 }, { "epoch": 0.6675598532067338, "grad_norm": 0.32090988755226135, "learning_rate": 9.886375669904886e-05, "loss": 0.0159, "step": 11460 }, { "epoch": 0.6681423661676472, "grad_norm": 0.21318046748638153, "learning_rate": 9.886024908807014e-05, "loss": 0.0129, "step": 11470 }, { "epoch": 0.6687248791285606, "grad_norm": 0.23719626665115356, "learning_rate": 9.885673613380806e-05, "loss": 0.0108, "step": 11480 }, { "epoch": 0.669307392089474, "grad_norm": 0.27283975481987, "learning_rate": 9.885321783664676e-05, "loss": 0.0091, "step": 11490 }, { "epoch": 0.6698899050503874, "grad_norm": 0.3661755621433258, "learning_rate": 9.884969419697101e-05, "loss": 0.0126, "step": 11500 }, { "epoch": 0.6704724180113008, "grad_norm": 0.22555513679981232, "learning_rate": 9.884616521516614e-05, "loss": 0.0094, "step": 11510 }, { "epoch": 0.6710549309722141, "grad_norm": 0.22565816342830658, "learning_rate": 9.88426308916181e-05, "loss": 0.0117, "step": 11520 }, { "epoch": 0.6716374439331275, "grad_norm": 0.4424149692058563, "learning_rate": 9.883909122671335e-05, "loss": 0.012, "step": 11530 }, { "epoch": 0.6722199568940409, "grad_norm": 0.24231690168380737, "learning_rate": 9.883554622083904e-05, "loss": 0.0104, "step": 11540 }, { "epoch": 0.6728024698549543, "grad_norm": 0.2792891263961792, "learning_rate": 9.88319958743828e-05, "loss": 0.0112, "step": 11550 }, { "epoch": 0.6733849828158677, "grad_norm": 0.3010520935058594, "learning_rate": 9.882844018773291e-05, "loss": 0.0106, "step": 11560 }, { "epoch": 0.6739674957767811, "grad_norm": 0.21273921430110931, "learning_rate": 9.882487916127823e-05, "loss": 0.0107, "step": 11570 }, { "epoch": 0.6745500087376944, "grad_norm": 0.4418739080429077, "learning_rate": 9.882131279540815e-05, "loss": 0.0103, "step": 11580 }, { "epoch": 0.6751325216986078, "grad_norm": 0.3295326828956604, "learning_rate": 9.881774109051271e-05, "loss": 0.0119, "step": 11590 }, { "epoch": 0.6757150346595212, "grad_norm": 0.25638407468795776, "learning_rate": 9.881416404698252e-05, "loss": 0.0111, "step": 11600 }, { "epoch": 0.6762975476204346, "grad_norm": 0.2892792224884033, "learning_rate": 9.881058166520873e-05, "loss": 0.0107, "step": 11610 }, { "epoch": 0.676880060581348, "grad_norm": 0.33731546998023987, "learning_rate": 9.880699394558311e-05, "loss": 0.0128, "step": 11620 }, { "epoch": 0.6774625735422614, "grad_norm": 0.23143166303634644, "learning_rate": 9.880340088849801e-05, "loss": 0.0092, "step": 11630 }, { "epoch": 0.6780450865031747, "grad_norm": 0.22860778868198395, "learning_rate": 9.879980249434637e-05, "loss": 0.0129, "step": 11640 }, { "epoch": 0.6786275994640881, "grad_norm": 0.3056270182132721, "learning_rate": 9.879619876352168e-05, "loss": 0.0095, "step": 11650 }, { "epoch": 0.6792101124250015, "grad_norm": 0.4038679599761963, "learning_rate": 9.879258969641809e-05, "loss": 0.0111, "step": 11660 }, { "epoch": 0.6797926253859148, "grad_norm": 0.1861642599105835, "learning_rate": 9.878897529343023e-05, "loss": 0.0112, "step": 11670 }, { "epoch": 0.6803751383468282, "grad_norm": 0.27775126695632935, "learning_rate": 9.878535555495338e-05, "loss": 0.0091, "step": 11680 }, { "epoch": 0.6809576513077416, "grad_norm": 0.32199418544769287, "learning_rate": 9.87817304813834e-05, "loss": 0.0107, "step": 11690 }, { "epoch": 0.6815401642686549, "grad_norm": 0.29690802097320557, "learning_rate": 9.877810007311671e-05, "loss": 0.0092, "step": 11700 }, { "epoch": 0.6821226772295683, "grad_norm": 0.24447891116142273, "learning_rate": 9.877446433055035e-05, "loss": 0.0127, "step": 11710 }, { "epoch": 0.6827051901904817, "grad_norm": 0.1703350692987442, "learning_rate": 9.877082325408191e-05, "loss": 0.0093, "step": 11720 }, { "epoch": 0.6832877031513951, "grad_norm": 0.3383316695690155, "learning_rate": 9.876717684410954e-05, "loss": 0.0106, "step": 11730 }, { "epoch": 0.6838702161123085, "grad_norm": 0.305230975151062, "learning_rate": 9.876352510103204e-05, "loss": 0.0089, "step": 11740 }, { "epoch": 0.6844527290732219, "grad_norm": 0.14697661995887756, "learning_rate": 9.875986802524875e-05, "loss": 0.0092, "step": 11750 }, { "epoch": 0.6850352420341352, "grad_norm": 0.28472980856895447, "learning_rate": 9.87562056171596e-05, "loss": 0.0126, "step": 11760 }, { "epoch": 0.6856177549950486, "grad_norm": 0.29250383377075195, "learning_rate": 9.875253787716511e-05, "loss": 0.0078, "step": 11770 }, { "epoch": 0.686200267955962, "grad_norm": 0.38419732451438904, "learning_rate": 9.874886480566637e-05, "loss": 0.009, "step": 11780 }, { "epoch": 0.6867827809168754, "grad_norm": 0.32346001267433167, "learning_rate": 9.874518640306507e-05, "loss": 0.0106, "step": 11790 }, { "epoch": 0.6873652938777888, "grad_norm": 0.23227253556251526, "learning_rate": 9.874150266976347e-05, "loss": 0.0119, "step": 11800 }, { "epoch": 0.6879478068387022, "grad_norm": 0.2185230255126953, "learning_rate": 9.873781360616443e-05, "loss": 0.01, "step": 11810 }, { "epoch": 0.6885303197996155, "grad_norm": 0.33769890666007996, "learning_rate": 9.873411921267137e-05, "loss": 0.012, "step": 11820 }, { "epoch": 0.6891128327605289, "grad_norm": 0.31108415126800537, "learning_rate": 9.873041948968829e-05, "loss": 0.0105, "step": 11830 }, { "epoch": 0.6896953457214423, "grad_norm": 0.2843596041202545, "learning_rate": 9.872671443761981e-05, "loss": 0.01, "step": 11840 }, { "epoch": 0.6902778586823557, "grad_norm": 0.29677069187164307, "learning_rate": 9.872300405687109e-05, "loss": 0.0128, "step": 11850 }, { "epoch": 0.6908603716432691, "grad_norm": 0.2883051633834839, "learning_rate": 9.871928834784792e-05, "loss": 0.0137, "step": 11860 }, { "epoch": 0.6914428846041825, "grad_norm": 0.20777954161167145, "learning_rate": 9.871556731095661e-05, "loss": 0.0144, "step": 11870 }, { "epoch": 0.6920253975650958, "grad_norm": 0.26724910736083984, "learning_rate": 9.871184094660411e-05, "loss": 0.0115, "step": 11880 }, { "epoch": 0.6926079105260092, "grad_norm": 0.46307632327079773, "learning_rate": 9.870810925519791e-05, "loss": 0.0133, "step": 11890 }, { "epoch": 0.6931904234869226, "grad_norm": 0.28797000646591187, "learning_rate": 9.870437223714612e-05, "loss": 0.0144, "step": 11900 }, { "epoch": 0.693772936447836, "grad_norm": 0.15057224035263062, "learning_rate": 9.87006298928574e-05, "loss": 0.011, "step": 11910 }, { "epoch": 0.6943554494087494, "grad_norm": 0.30991995334625244, "learning_rate": 9.869688222274103e-05, "loss": 0.0096, "step": 11920 }, { "epoch": 0.6949379623696628, "grad_norm": 0.2644386887550354, "learning_rate": 9.869312922720681e-05, "loss": 0.0084, "step": 11930 }, { "epoch": 0.6955204753305761, "grad_norm": 0.38433077931404114, "learning_rate": 9.868937090666521e-05, "loss": 0.0125, "step": 11940 }, { "epoch": 0.6961029882914895, "grad_norm": 0.19309815764427185, "learning_rate": 9.86856072615272e-05, "loss": 0.0089, "step": 11950 }, { "epoch": 0.6966855012524029, "grad_norm": 0.1911529153585434, "learning_rate": 9.868183829220438e-05, "loss": 0.011, "step": 11960 }, { "epoch": 0.6972680142133163, "grad_norm": 0.28755906224250793, "learning_rate": 9.867806399910893e-05, "loss": 0.0117, "step": 11970 }, { "epoch": 0.6978505271742296, "grad_norm": 0.28259196877479553, "learning_rate": 9.867428438265356e-05, "loss": 0.0092, "step": 11980 }, { "epoch": 0.698433040135143, "grad_norm": 0.2533143162727356, "learning_rate": 9.867049944325165e-05, "loss": 0.0091, "step": 11990 }, { "epoch": 0.6990155530960563, "grad_norm": 0.18673262000083923, "learning_rate": 9.86667091813171e-05, "loss": 0.0089, "step": 12000 }, { "epoch": 0.6995980660569697, "grad_norm": 0.3116036355495453, "learning_rate": 9.866291359726438e-05, "loss": 0.0122, "step": 12010 }, { "epoch": 0.7001805790178831, "grad_norm": 0.21451440453529358, "learning_rate": 9.865911269150861e-05, "loss": 0.0129, "step": 12020 }, { "epoch": 0.7007630919787965, "grad_norm": 0.39065560698509216, "learning_rate": 9.865530646446544e-05, "loss": 0.0105, "step": 12030 }, { "epoch": 0.7013456049397099, "grad_norm": 0.24504049122333527, "learning_rate": 9.86514949165511e-05, "loss": 0.0093, "step": 12040 }, { "epoch": 0.7019281179006233, "grad_norm": 0.2972680628299713, "learning_rate": 9.864767804818243e-05, "loss": 0.0114, "step": 12050 }, { "epoch": 0.7025106308615366, "grad_norm": 0.5118283629417419, "learning_rate": 9.86438558597768e-05, "loss": 0.0131, "step": 12060 }, { "epoch": 0.70309314382245, "grad_norm": 0.24782323837280273, "learning_rate": 9.864002835175225e-05, "loss": 0.0108, "step": 12070 }, { "epoch": 0.7036756567833634, "grad_norm": 0.2735443115234375, "learning_rate": 9.863619552452734e-05, "loss": 0.0112, "step": 12080 }, { "epoch": 0.7042581697442768, "grad_norm": 0.35497885942459106, "learning_rate": 9.863235737852119e-05, "loss": 0.0091, "step": 12090 }, { "epoch": 0.7048406827051902, "grad_norm": 0.27954673767089844, "learning_rate": 9.862851391415356e-05, "loss": 0.0118, "step": 12100 }, { "epoch": 0.7054231956661036, "grad_norm": 0.25629347562789917, "learning_rate": 9.862466513184477e-05, "loss": 0.0128, "step": 12110 }, { "epoch": 0.706005708627017, "grad_norm": 0.2656727135181427, "learning_rate": 9.86208110320157e-05, "loss": 0.0122, "step": 12120 }, { "epoch": 0.7065882215879303, "grad_norm": 0.1817159652709961, "learning_rate": 9.861695161508784e-05, "loss": 0.0095, "step": 12130 }, { "epoch": 0.7071707345488437, "grad_norm": 0.3757285475730896, "learning_rate": 9.861308688148324e-05, "loss": 0.0095, "step": 12140 }, { "epoch": 0.7077532475097571, "grad_norm": 0.41108325123786926, "learning_rate": 9.860921683162455e-05, "loss": 0.0135, "step": 12150 }, { "epoch": 0.7083357604706705, "grad_norm": 0.4094219505786896, "learning_rate": 9.860534146593499e-05, "loss": 0.0188, "step": 12160 }, { "epoch": 0.7089182734315839, "grad_norm": 0.22639647126197815, "learning_rate": 9.860146078483836e-05, "loss": 0.0111, "step": 12170 }, { "epoch": 0.7095007863924973, "grad_norm": 0.38253772258758545, "learning_rate": 9.859757478875905e-05, "loss": 0.0138, "step": 12180 }, { "epoch": 0.7100832993534106, "grad_norm": 0.33241885900497437, "learning_rate": 9.859368347812204e-05, "loss": 0.0132, "step": 12190 }, { "epoch": 0.710665812314324, "grad_norm": 0.32325369119644165, "learning_rate": 9.858978685335285e-05, "loss": 0.0091, "step": 12200 }, { "epoch": 0.7112483252752374, "grad_norm": 0.33348292112350464, "learning_rate": 9.858588491487763e-05, "loss": 0.0128, "step": 12210 }, { "epoch": 0.7118308382361508, "grad_norm": 0.16235622763633728, "learning_rate": 9.858197766312308e-05, "loss": 0.0136, "step": 12220 }, { "epoch": 0.7124133511970642, "grad_norm": 0.4801989793777466, "learning_rate": 9.857806509851649e-05, "loss": 0.0157, "step": 12230 }, { "epoch": 0.7129958641579776, "grad_norm": 0.25309666991233826, "learning_rate": 9.857414722148574e-05, "loss": 0.0103, "step": 12240 }, { "epoch": 0.7135783771188909, "grad_norm": 0.2188948094844818, "learning_rate": 9.857022403245928e-05, "loss": 0.0102, "step": 12250 }, { "epoch": 0.7141608900798043, "grad_norm": 0.27165719866752625, "learning_rate": 9.856629553186615e-05, "loss": 0.0106, "step": 12260 }, { "epoch": 0.7147434030407177, "grad_norm": 0.4029421806335449, "learning_rate": 9.856236172013595e-05, "loss": 0.0128, "step": 12270 }, { "epoch": 0.7153259160016311, "grad_norm": 0.33737045526504517, "learning_rate": 9.85584225976989e-05, "loss": 0.0117, "step": 12280 }, { "epoch": 0.7159084289625444, "grad_norm": 0.3688197433948517, "learning_rate": 9.855447816498575e-05, "loss": 0.0107, "step": 12290 }, { "epoch": 0.7164909419234577, "grad_norm": 0.23958411812782288, "learning_rate": 9.855052842242787e-05, "loss": 0.0087, "step": 12300 }, { "epoch": 0.7170734548843711, "grad_norm": 0.2046879231929779, "learning_rate": 9.85465733704572e-05, "loss": 0.01, "step": 12310 }, { "epoch": 0.7176559678452845, "grad_norm": 0.2616514265537262, "learning_rate": 9.854261300950624e-05, "loss": 0.0095, "step": 12320 }, { "epoch": 0.7182384808061979, "grad_norm": 0.3517744839191437, "learning_rate": 9.853864734000813e-05, "loss": 0.0095, "step": 12330 }, { "epoch": 0.7188209937671113, "grad_norm": 0.2378864586353302, "learning_rate": 9.85346763623965e-05, "loss": 0.0109, "step": 12340 }, { "epoch": 0.7194035067280247, "grad_norm": 0.18699803948402405, "learning_rate": 9.853070007710564e-05, "loss": 0.0101, "step": 12350 }, { "epoch": 0.719986019688938, "grad_norm": 0.28614553809165955, "learning_rate": 9.85267184845704e-05, "loss": 0.0086, "step": 12360 }, { "epoch": 0.7205685326498514, "grad_norm": 0.1522756814956665, "learning_rate": 9.852273158522616e-05, "loss": 0.0075, "step": 12370 }, { "epoch": 0.7211510456107648, "grad_norm": 0.3504115641117096, "learning_rate": 9.851873937950896e-05, "loss": 0.0127, "step": 12380 }, { "epoch": 0.7217335585716782, "grad_norm": 0.23649334907531738, "learning_rate": 9.851474186785537e-05, "loss": 0.0112, "step": 12390 }, { "epoch": 0.7223160715325916, "grad_norm": 0.309222936630249, "learning_rate": 9.851073905070254e-05, "loss": 0.0127, "step": 12400 }, { "epoch": 0.722898584493505, "grad_norm": 0.37841108441352844, "learning_rate": 9.850673092848824e-05, "loss": 0.012, "step": 12410 }, { "epoch": 0.7234810974544184, "grad_norm": 0.2518141269683838, "learning_rate": 9.850271750165077e-05, "loss": 0.0119, "step": 12420 }, { "epoch": 0.7240636104153317, "grad_norm": 0.2948949635028839, "learning_rate": 9.849869877062902e-05, "loss": 0.0108, "step": 12430 }, { "epoch": 0.7246461233762451, "grad_norm": 0.2947480380535126, "learning_rate": 9.849467473586252e-05, "loss": 0.0099, "step": 12440 }, { "epoch": 0.7252286363371585, "grad_norm": 0.32007908821105957, "learning_rate": 9.849064539779127e-05, "loss": 0.0088, "step": 12450 }, { "epoch": 0.7258111492980719, "grad_norm": 0.2762373387813568, "learning_rate": 9.848661075685594e-05, "loss": 0.0093, "step": 12460 }, { "epoch": 0.7263936622589853, "grad_norm": 0.2780131995677948, "learning_rate": 9.848257081349778e-05, "loss": 0.0084, "step": 12470 }, { "epoch": 0.7269761752198987, "grad_norm": 0.23851686716079712, "learning_rate": 9.847852556815856e-05, "loss": 0.0109, "step": 12480 }, { "epoch": 0.727558688180812, "grad_norm": 0.2364404797554016, "learning_rate": 9.847447502128067e-05, "loss": 0.0089, "step": 12490 }, { "epoch": 0.7281412011417254, "grad_norm": 0.36279597878456116, "learning_rate": 9.847041917330708e-05, "loss": 0.0107, "step": 12500 }, { "epoch": 0.7287237141026388, "grad_norm": 0.27778196334838867, "learning_rate": 9.846635802468132e-05, "loss": 0.0083, "step": 12510 }, { "epoch": 0.7293062270635522, "grad_norm": 0.303584486246109, "learning_rate": 9.84622915758475e-05, "loss": 0.0097, "step": 12520 }, { "epoch": 0.7298887400244656, "grad_norm": 0.26612570881843567, "learning_rate": 9.845821982725034e-05, "loss": 0.0088, "step": 12530 }, { "epoch": 0.730471252985379, "grad_norm": 0.43369412422180176, "learning_rate": 9.845414277933514e-05, "loss": 0.0085, "step": 12540 }, { "epoch": 0.7310537659462923, "grad_norm": 0.2940000891685486, "learning_rate": 9.845006043254771e-05, "loss": 0.0116, "step": 12550 }, { "epoch": 0.7316362789072057, "grad_norm": 0.2642724812030792, "learning_rate": 9.844597278733451e-05, "loss": 0.0082, "step": 12560 }, { "epoch": 0.7322187918681191, "grad_norm": 0.34561997652053833, "learning_rate": 9.844187984414259e-05, "loss": 0.0113, "step": 12570 }, { "epoch": 0.7328013048290325, "grad_norm": 0.3849681317806244, "learning_rate": 9.84377816034195e-05, "loss": 0.0102, "step": 12580 }, { "epoch": 0.7333838177899459, "grad_norm": 0.14842355251312256, "learning_rate": 9.843367806561345e-05, "loss": 0.0092, "step": 12590 }, { "epoch": 0.7339663307508592, "grad_norm": 0.32838743925094604, "learning_rate": 9.842956923117317e-05, "loss": 0.0104, "step": 12600 }, { "epoch": 0.7345488437117725, "grad_norm": 0.2051447331905365, "learning_rate": 9.842545510054802e-05, "loss": 0.0081, "step": 12610 }, { "epoch": 0.7351313566726859, "grad_norm": 0.2681458592414856, "learning_rate": 9.842133567418792e-05, "loss": 0.0096, "step": 12620 }, { "epoch": 0.7357138696335993, "grad_norm": 0.1889035701751709, "learning_rate": 9.841721095254333e-05, "loss": 0.0111, "step": 12630 }, { "epoch": 0.7362963825945127, "grad_norm": 0.23692217469215393, "learning_rate": 9.841308093606537e-05, "loss": 0.0098, "step": 12640 }, { "epoch": 0.7368788955554261, "grad_norm": 0.24962086975574493, "learning_rate": 9.840894562520565e-05, "loss": 0.0099, "step": 12650 }, { "epoch": 0.7374614085163395, "grad_norm": 0.5094000697135925, "learning_rate": 9.840480502041642e-05, "loss": 0.0119, "step": 12660 }, { "epoch": 0.7380439214772528, "grad_norm": 0.3661715090274811, "learning_rate": 9.840065912215049e-05, "loss": 0.0136, "step": 12670 }, { "epoch": 0.7386264344381662, "grad_norm": 0.2870263159275055, "learning_rate": 9.839650793086124e-05, "loss": 0.0096, "step": 12680 }, { "epoch": 0.7392089473990796, "grad_norm": 0.370992511510849, "learning_rate": 9.839235144700265e-05, "loss": 0.0152, "step": 12690 }, { "epoch": 0.739791460359993, "grad_norm": 0.22844703495502472, "learning_rate": 9.838818967102926e-05, "loss": 0.0123, "step": 12700 }, { "epoch": 0.7403739733209064, "grad_norm": 0.2773178219795227, "learning_rate": 9.83840226033962e-05, "loss": 0.0081, "step": 12710 }, { "epoch": 0.7409564862818198, "grad_norm": 0.33924540877342224, "learning_rate": 9.837985024455918e-05, "loss": 0.0107, "step": 12720 }, { "epoch": 0.7415389992427331, "grad_norm": 0.4531949460506439, "learning_rate": 9.837567259497447e-05, "loss": 0.0113, "step": 12730 }, { "epoch": 0.7421215122036465, "grad_norm": 0.2751784920692444, "learning_rate": 9.837148965509894e-05, "loss": 0.0096, "step": 12740 }, { "epoch": 0.7427040251645599, "grad_norm": 0.26049160957336426, "learning_rate": 9.836730142539001e-05, "loss": 0.0091, "step": 12750 }, { "epoch": 0.7432865381254733, "grad_norm": 0.17433734238147736, "learning_rate": 9.836310790630574e-05, "loss": 0.0099, "step": 12760 }, { "epoch": 0.7438690510863867, "grad_norm": 0.21776194870471954, "learning_rate": 9.83589090983047e-05, "loss": 0.0087, "step": 12770 }, { "epoch": 0.7444515640473001, "grad_norm": 0.31042933464050293, "learning_rate": 9.835470500184605e-05, "loss": 0.0155, "step": 12780 }, { "epoch": 0.7450340770082134, "grad_norm": 0.5370248556137085, "learning_rate": 9.835049561738957e-05, "loss": 0.0139, "step": 12790 }, { "epoch": 0.7456165899691268, "grad_norm": 0.4677162766456604, "learning_rate": 9.834628094539558e-05, "loss": 0.0124, "step": 12800 }, { "epoch": 0.7461991029300402, "grad_norm": 0.346724271774292, "learning_rate": 9.834206098632499e-05, "loss": 0.0129, "step": 12810 }, { "epoch": 0.7467816158909536, "grad_norm": 0.2665993273258209, "learning_rate": 9.833783574063931e-05, "loss": 0.0113, "step": 12820 }, { "epoch": 0.747364128851867, "grad_norm": 0.27550584077835083, "learning_rate": 9.833360520880058e-05, "loss": 0.0115, "step": 12830 }, { "epoch": 0.7479466418127804, "grad_norm": 0.36114075779914856, "learning_rate": 9.832936939127144e-05, "loss": 0.013, "step": 12840 }, { "epoch": 0.7485291547736938, "grad_norm": 0.3218906819820404, "learning_rate": 9.832512828851515e-05, "loss": 0.0097, "step": 12850 }, { "epoch": 0.7491116677346071, "grad_norm": 0.21231773495674133, "learning_rate": 9.832088190099546e-05, "loss": 0.0098, "step": 12860 }, { "epoch": 0.7496941806955205, "grad_norm": 0.4011453092098236, "learning_rate": 9.831663022917679e-05, "loss": 0.0111, "step": 12870 }, { "epoch": 0.7502766936564339, "grad_norm": 0.2760467827320099, "learning_rate": 9.831237327352407e-05, "loss": 0.0104, "step": 12880 }, { "epoch": 0.7508592066173473, "grad_norm": 0.1996665596961975, "learning_rate": 9.830811103450286e-05, "loss": 0.0095, "step": 12890 }, { "epoch": 0.7514417195782607, "grad_norm": 0.24047091603279114, "learning_rate": 9.830384351257924e-05, "loss": 0.008, "step": 12900 }, { "epoch": 0.7520242325391739, "grad_norm": 0.21825580298900604, "learning_rate": 9.829957070821993e-05, "loss": 0.0086, "step": 12910 }, { "epoch": 0.7526067455000873, "grad_norm": 0.34336215257644653, "learning_rate": 9.829529262189218e-05, "loss": 0.0109, "step": 12920 }, { "epoch": 0.7531892584610007, "grad_norm": 0.3204598128795624, "learning_rate": 9.829100925406385e-05, "loss": 0.0101, "step": 12930 }, { "epoch": 0.7537717714219141, "grad_norm": 0.29522451758384705, "learning_rate": 9.828672060520333e-05, "loss": 0.0066, "step": 12940 }, { "epoch": 0.7543542843828275, "grad_norm": 0.25413066148757935, "learning_rate": 9.828242667577966e-05, "loss": 0.0104, "step": 12950 }, { "epoch": 0.7549367973437409, "grad_norm": 0.3678099513053894, "learning_rate": 9.82781274662624e-05, "loss": 0.0106, "step": 12960 }, { "epoch": 0.7555193103046542, "grad_norm": 0.25887787342071533, "learning_rate": 9.82738229771217e-05, "loss": 0.0109, "step": 12970 }, { "epoch": 0.7561018232655676, "grad_norm": 0.5047935247421265, "learning_rate": 9.826951320882829e-05, "loss": 0.013, "step": 12980 }, { "epoch": 0.756684336226481, "grad_norm": 0.3073876202106476, "learning_rate": 9.826519816185351e-05, "loss": 0.0122, "step": 12990 }, { "epoch": 0.7572668491873944, "grad_norm": 0.38171812891960144, "learning_rate": 9.826087783666921e-05, "loss": 0.0117, "step": 13000 }, { "epoch": 0.7578493621483078, "grad_norm": 0.2137841433286667, "learning_rate": 9.825655223374787e-05, "loss": 0.0124, "step": 13010 }, { "epoch": 0.7584318751092212, "grad_norm": 0.313880056142807, "learning_rate": 9.825222135356253e-05, "loss": 0.0102, "step": 13020 }, { "epoch": 0.7590143880701345, "grad_norm": 0.2235432267189026, "learning_rate": 9.82478851965868e-05, "loss": 0.0105, "step": 13030 }, { "epoch": 0.7595969010310479, "grad_norm": 0.3323928117752075, "learning_rate": 9.82435437632949e-05, "loss": 0.0097, "step": 13040 }, { "epoch": 0.7601794139919613, "grad_norm": 0.2440030574798584, "learning_rate": 9.823919705416158e-05, "loss": 0.0106, "step": 13050 }, { "epoch": 0.7607619269528747, "grad_norm": 0.19985781610012054, "learning_rate": 9.82348450696622e-05, "loss": 0.0105, "step": 13060 }, { "epoch": 0.7613444399137881, "grad_norm": 0.27806660532951355, "learning_rate": 9.823048781027268e-05, "loss": 0.0069, "step": 13070 }, { "epoch": 0.7619269528747015, "grad_norm": 0.3436034619808197, "learning_rate": 9.822612527646953e-05, "loss": 0.0087, "step": 13080 }, { "epoch": 0.7625094658356149, "grad_norm": 0.3130769431591034, "learning_rate": 9.822175746872984e-05, "loss": 0.0116, "step": 13090 }, { "epoch": 0.7630919787965282, "grad_norm": 0.5062789916992188, "learning_rate": 9.821738438753123e-05, "loss": 0.0114, "step": 13100 }, { "epoch": 0.7636744917574416, "grad_norm": 0.35312896966934204, "learning_rate": 9.821300603335196e-05, "loss": 0.0117, "step": 13110 }, { "epoch": 0.764257004718355, "grad_norm": 0.4013625979423523, "learning_rate": 9.820862240667085e-05, "loss": 0.0122, "step": 13120 }, { "epoch": 0.7648395176792684, "grad_norm": 0.2546074688434601, "learning_rate": 9.820423350796726e-05, "loss": 0.0141, "step": 13130 }, { "epoch": 0.7654220306401818, "grad_norm": 0.26726576685905457, "learning_rate": 9.819983933772118e-05, "loss": 0.0093, "step": 13140 }, { "epoch": 0.7660045436010952, "grad_norm": 0.21654672920703888, "learning_rate": 9.819543989641314e-05, "loss": 0.0093, "step": 13150 }, { "epoch": 0.7665870565620085, "grad_norm": 0.20355702936649323, "learning_rate": 9.819103518452423e-05, "loss": 0.0083, "step": 13160 }, { "epoch": 0.7671695695229219, "grad_norm": 0.17116662859916687, "learning_rate": 9.818662520253618e-05, "loss": 0.0119, "step": 13170 }, { "epoch": 0.7677520824838353, "grad_norm": 0.18393750488758087, "learning_rate": 9.818220995093126e-05, "loss": 0.0089, "step": 13180 }, { "epoch": 0.7683345954447487, "grad_norm": 0.28204232454299927, "learning_rate": 9.817778943019228e-05, "loss": 0.0088, "step": 13190 }, { "epoch": 0.7689171084056621, "grad_norm": 0.3675989806652069, "learning_rate": 9.81733636408027e-05, "loss": 0.0137, "step": 13200 }, { "epoch": 0.7694996213665755, "grad_norm": 0.31780755519866943, "learning_rate": 9.816893258324649e-05, "loss": 0.0089, "step": 13210 }, { "epoch": 0.7700821343274887, "grad_norm": 0.42810994386672974, "learning_rate": 9.816449625800823e-05, "loss": 0.0115, "step": 13220 }, { "epoch": 0.7706646472884021, "grad_norm": 0.40573492646217346, "learning_rate": 9.816005466557308e-05, "loss": 0.0117, "step": 13230 }, { "epoch": 0.7712471602493155, "grad_norm": 0.33775392174720764, "learning_rate": 9.815560780642674e-05, "loss": 0.0122, "step": 13240 }, { "epoch": 0.7718296732102289, "grad_norm": 0.3058948218822479, "learning_rate": 9.815115568105555e-05, "loss": 0.0126, "step": 13250 }, { "epoch": 0.7724121861711423, "grad_norm": 0.3042391836643219, "learning_rate": 9.814669828994638e-05, "loss": 0.0131, "step": 13260 }, { "epoch": 0.7729946991320557, "grad_norm": 0.3347291648387909, "learning_rate": 9.814223563358665e-05, "loss": 0.0149, "step": 13270 }, { "epoch": 0.773577212092969, "grad_norm": 0.35931912064552307, "learning_rate": 9.813776771246443e-05, "loss": 0.0101, "step": 13280 }, { "epoch": 0.7741597250538824, "grad_norm": 0.5419941544532776, "learning_rate": 9.813329452706829e-05, "loss": 0.0132, "step": 13290 }, { "epoch": 0.7747422380147958, "grad_norm": 0.23611658811569214, "learning_rate": 9.812881607788744e-05, "loss": 0.0125, "step": 13300 }, { "epoch": 0.7753247509757092, "grad_norm": 0.2191879153251648, "learning_rate": 9.812433236541163e-05, "loss": 0.0104, "step": 13310 }, { "epoch": 0.7759072639366226, "grad_norm": 0.21858038008213043, "learning_rate": 9.811984339013116e-05, "loss": 0.013, "step": 13320 }, { "epoch": 0.776489776897536, "grad_norm": 0.3306126296520233, "learning_rate": 9.811534915253698e-05, "loss": 0.013, "step": 13330 }, { "epoch": 0.7770722898584493, "grad_norm": 0.2514423727989197, "learning_rate": 9.811084965312056e-05, "loss": 0.0101, "step": 13340 }, { "epoch": 0.7776548028193627, "grad_norm": 0.2867768406867981, "learning_rate": 9.810634489237396e-05, "loss": 0.0082, "step": 13350 }, { "epoch": 0.7782373157802761, "grad_norm": 0.2694252133369446, "learning_rate": 9.81018348707898e-05, "loss": 0.0111, "step": 13360 }, { "epoch": 0.7788198287411895, "grad_norm": 0.2057276964187622, "learning_rate": 9.809731958886131e-05, "loss": 0.0086, "step": 13370 }, { "epoch": 0.7794023417021029, "grad_norm": 0.2662864923477173, "learning_rate": 9.809279904708224e-05, "loss": 0.0104, "step": 13380 }, { "epoch": 0.7799848546630163, "grad_norm": 0.23671145737171173, "learning_rate": 9.808827324594699e-05, "loss": 0.0106, "step": 13390 }, { "epoch": 0.7805673676239296, "grad_norm": 0.19641733169555664, "learning_rate": 9.808374218595046e-05, "loss": 0.0082, "step": 13400 }, { "epoch": 0.781149880584843, "grad_norm": 0.31807267665863037, "learning_rate": 9.80792058675882e-05, "loss": 0.0096, "step": 13410 }, { "epoch": 0.7817323935457564, "grad_norm": 0.40290647745132446, "learning_rate": 9.807466429135627e-05, "loss": 0.0094, "step": 13420 }, { "epoch": 0.7823149065066698, "grad_norm": 0.24143372476100922, "learning_rate": 9.807011745775132e-05, "loss": 0.0104, "step": 13430 }, { "epoch": 0.7828974194675832, "grad_norm": 0.21661540865898132, "learning_rate": 9.806556536727061e-05, "loss": 0.0152, "step": 13440 }, { "epoch": 0.7834799324284966, "grad_norm": 0.22297504544258118, "learning_rate": 9.806100802041193e-05, "loss": 0.0105, "step": 13450 }, { "epoch": 0.78406244538941, "grad_norm": 0.26345938444137573, "learning_rate": 9.805644541767368e-05, "loss": 0.0111, "step": 13460 }, { "epoch": 0.7846449583503233, "grad_norm": 0.23876027762889862, "learning_rate": 9.805187755955478e-05, "loss": 0.0105, "step": 13470 }, { "epoch": 0.7852274713112367, "grad_norm": 0.20486889779567719, "learning_rate": 9.804730444655483e-05, "loss": 0.0072, "step": 13480 }, { "epoch": 0.7858099842721501, "grad_norm": 0.301163911819458, "learning_rate": 9.804272607917388e-05, "loss": 0.0156, "step": 13490 }, { "epoch": 0.7863924972330635, "grad_norm": 0.3384385406970978, "learning_rate": 9.803814245791265e-05, "loss": 0.0123, "step": 13500 }, { "epoch": 0.7869750101939769, "grad_norm": 0.25124239921569824, "learning_rate": 9.803355358327239e-05, "loss": 0.0127, "step": 13510 }, { "epoch": 0.7875575231548902, "grad_norm": 0.20830930769443512, "learning_rate": 9.802895945575492e-05, "loss": 0.0091, "step": 13520 }, { "epoch": 0.7881400361158035, "grad_norm": 0.18475474417209625, "learning_rate": 9.802436007586266e-05, "loss": 0.0088, "step": 13530 }, { "epoch": 0.7887225490767169, "grad_norm": 0.2761193513870239, "learning_rate": 9.801975544409858e-05, "loss": 0.0145, "step": 13540 }, { "epoch": 0.7893050620376303, "grad_norm": 0.2530314028263092, "learning_rate": 9.801514556096625e-05, "loss": 0.0134, "step": 13550 }, { "epoch": 0.7898875749985437, "grad_norm": 0.3253801763057709, "learning_rate": 9.801053042696977e-05, "loss": 0.0127, "step": 13560 }, { "epoch": 0.7904700879594571, "grad_norm": 0.26918548345565796, "learning_rate": 9.800591004261388e-05, "loss": 0.0082, "step": 13570 }, { "epoch": 0.7910526009203704, "grad_norm": 0.22753633558750153, "learning_rate": 9.800128440840385e-05, "loss": 0.0138, "step": 13580 }, { "epoch": 0.7916351138812838, "grad_norm": 0.34229299426078796, "learning_rate": 9.799665352484552e-05, "loss": 0.0106, "step": 13590 }, { "epoch": 0.7922176268421972, "grad_norm": 0.3219163417816162, "learning_rate": 9.799201739244532e-05, "loss": 0.0122, "step": 13600 }, { "epoch": 0.7928001398031106, "grad_norm": 0.35308024287223816, "learning_rate": 9.798737601171025e-05, "loss": 0.0113, "step": 13610 }, { "epoch": 0.793382652764024, "grad_norm": 0.35872310400009155, "learning_rate": 9.79827293831479e-05, "loss": 0.0112, "step": 13620 }, { "epoch": 0.7939651657249374, "grad_norm": 0.24670813977718353, "learning_rate": 9.797807750726638e-05, "loss": 0.011, "step": 13630 }, { "epoch": 0.7945476786858507, "grad_norm": 0.2595808506011963, "learning_rate": 9.797342038457446e-05, "loss": 0.0102, "step": 13640 }, { "epoch": 0.7951301916467641, "grad_norm": 0.33007222414016724, "learning_rate": 9.796875801558141e-05, "loss": 0.0094, "step": 13650 }, { "epoch": 0.7957127046076775, "grad_norm": 0.40683209896087646, "learning_rate": 9.79640904007971e-05, "loss": 0.0113, "step": 13660 }, { "epoch": 0.7962952175685909, "grad_norm": 0.629334568977356, "learning_rate": 9.795941754073199e-05, "loss": 0.0096, "step": 13670 }, { "epoch": 0.7968777305295043, "grad_norm": 0.5050783157348633, "learning_rate": 9.795473943589705e-05, "loss": 0.0142, "step": 13680 }, { "epoch": 0.7974602434904177, "grad_norm": 0.4388831555843353, "learning_rate": 9.795005608680394e-05, "loss": 0.0107, "step": 13690 }, { "epoch": 0.798042756451331, "grad_norm": 0.3246675133705139, "learning_rate": 9.794536749396477e-05, "loss": 0.0104, "step": 13700 }, { "epoch": 0.7986252694122444, "grad_norm": 0.3767477571964264, "learning_rate": 9.79406736578923e-05, "loss": 0.0116, "step": 13710 }, { "epoch": 0.7992077823731578, "grad_norm": 0.38453423976898193, "learning_rate": 9.793597457909984e-05, "loss": 0.009, "step": 13720 }, { "epoch": 0.7997902953340712, "grad_norm": 0.39086395502090454, "learning_rate": 9.793127025810127e-05, "loss": 0.0116, "step": 13730 }, { "epoch": 0.8003728082949846, "grad_norm": 0.28115928173065186, "learning_rate": 9.792656069541104e-05, "loss": 0.0091, "step": 13740 }, { "epoch": 0.800955321255898, "grad_norm": 0.3471229076385498, "learning_rate": 9.79218458915442e-05, "loss": 0.0172, "step": 13750 }, { "epoch": 0.8015378342168114, "grad_norm": 0.2664870321750641, "learning_rate": 9.791712584701634e-05, "loss": 0.01, "step": 13760 }, { "epoch": 0.8021203471777247, "grad_norm": 0.3772764205932617, "learning_rate": 9.791240056234364e-05, "loss": 0.0102, "step": 13770 }, { "epoch": 0.8027028601386381, "grad_norm": 0.26389750838279724, "learning_rate": 9.790767003804283e-05, "loss": 0.0097, "step": 13780 }, { "epoch": 0.8032853730995515, "grad_norm": 0.3378036916255951, "learning_rate": 9.790293427463126e-05, "loss": 0.0115, "step": 13790 }, { "epoch": 0.8038678860604649, "grad_norm": 0.2925207018852234, "learning_rate": 9.789819327262684e-05, "loss": 0.012, "step": 13800 }, { "epoch": 0.8044503990213783, "grad_norm": 0.25153666734695435, "learning_rate": 9.7893447032548e-05, "loss": 0.0145, "step": 13810 }, { "epoch": 0.8050329119822917, "grad_norm": 0.29271867871284485, "learning_rate": 9.78886955549138e-05, "loss": 0.0103, "step": 13820 }, { "epoch": 0.805615424943205, "grad_norm": 0.3172508776187897, "learning_rate": 9.788393884024387e-05, "loss": 0.0122, "step": 13830 }, { "epoch": 0.8061979379041183, "grad_norm": 0.3116541802883148, "learning_rate": 9.787917688905836e-05, "loss": 0.011, "step": 13840 }, { "epoch": 0.8067804508650317, "grad_norm": 0.2347998470067978, "learning_rate": 9.787440970187807e-05, "loss": 0.0092, "step": 13850 }, { "epoch": 0.8073629638259451, "grad_norm": 0.2998109459877014, "learning_rate": 9.786963727922429e-05, "loss": 0.0114, "step": 13860 }, { "epoch": 0.8079454767868585, "grad_norm": 0.2204401195049286, "learning_rate": 9.786485962161897e-05, "loss": 0.0091, "step": 13870 }, { "epoch": 0.8085279897477718, "grad_norm": 0.2570997476577759, "learning_rate": 9.786007672958455e-05, "loss": 0.0108, "step": 13880 }, { "epoch": 0.8091105027086852, "grad_norm": 0.2895021140575409, "learning_rate": 9.78552886036441e-05, "loss": 0.0113, "step": 13890 }, { "epoch": 0.8096930156695986, "grad_norm": 0.3349132537841797, "learning_rate": 9.785049524432124e-05, "loss": 0.0109, "step": 13900 }, { "epoch": 0.810275528630512, "grad_norm": 0.36986243724823, "learning_rate": 9.784569665214016e-05, "loss": 0.0095, "step": 13910 }, { "epoch": 0.8108580415914254, "grad_norm": 0.3068552017211914, "learning_rate": 9.784089282762563e-05, "loss": 0.0115, "step": 13920 }, { "epoch": 0.8114405545523388, "grad_norm": 0.3725542724132538, "learning_rate": 9.7836083771303e-05, "loss": 0.0092, "step": 13930 }, { "epoch": 0.8120230675132522, "grad_norm": 0.23730771243572235, "learning_rate": 9.783126948369817e-05, "loss": 0.0106, "step": 13940 }, { "epoch": 0.8126055804741655, "grad_norm": 0.2727254331111908, "learning_rate": 9.78264499653376e-05, "loss": 0.0119, "step": 13950 }, { "epoch": 0.8131880934350789, "grad_norm": 0.19818268716335297, "learning_rate": 9.782162521674838e-05, "loss": 0.0104, "step": 13960 }, { "epoch": 0.8137706063959923, "grad_norm": 0.25693196058273315, "learning_rate": 9.781679523845812e-05, "loss": 0.0088, "step": 13970 }, { "epoch": 0.8143531193569057, "grad_norm": 0.3451943099498749, "learning_rate": 9.781196003099502e-05, "loss": 0.0081, "step": 13980 }, { "epoch": 0.8149356323178191, "grad_norm": 0.18589337170124054, "learning_rate": 9.780711959488786e-05, "loss": 0.0109, "step": 13990 }, { "epoch": 0.8155181452787325, "grad_norm": 0.21624906361103058, "learning_rate": 9.780227393066599e-05, "loss": 0.0079, "step": 14000 }, { "epoch": 0.8161006582396458, "grad_norm": 0.2712023854255676, "learning_rate": 9.77974230388593e-05, "loss": 0.0111, "step": 14010 }, { "epoch": 0.8166831712005592, "grad_norm": 0.2408689707517624, "learning_rate": 9.779256691999829e-05, "loss": 0.0109, "step": 14020 }, { "epoch": 0.8172656841614726, "grad_norm": 0.20135101675987244, "learning_rate": 9.778770557461403e-05, "loss": 0.0102, "step": 14030 }, { "epoch": 0.817848197122386, "grad_norm": 0.3731323182582855, "learning_rate": 9.778283900323812e-05, "loss": 0.0104, "step": 14040 }, { "epoch": 0.8184307100832994, "grad_norm": 0.3019280433654785, "learning_rate": 9.777796720640277e-05, "loss": 0.0092, "step": 14050 }, { "epoch": 0.8190132230442128, "grad_norm": 0.35238948464393616, "learning_rate": 9.777309018464078e-05, "loss": 0.0099, "step": 14060 }, { "epoch": 0.8195957360051261, "grad_norm": 0.25682762265205383, "learning_rate": 9.776820793848547e-05, "loss": 0.0096, "step": 14070 }, { "epoch": 0.8201782489660395, "grad_norm": 0.20018793642520905, "learning_rate": 9.776332046847075e-05, "loss": 0.009, "step": 14080 }, { "epoch": 0.8207607619269529, "grad_norm": 0.2873724102973938, "learning_rate": 9.775842777513111e-05, "loss": 0.0086, "step": 14090 }, { "epoch": 0.8213432748878663, "grad_norm": 0.32729244232177734, "learning_rate": 9.775352985900163e-05, "loss": 0.0096, "step": 14100 }, { "epoch": 0.8219257878487797, "grad_norm": 0.3136199116706848, "learning_rate": 9.774862672061791e-05, "loss": 0.0074, "step": 14110 }, { "epoch": 0.8225083008096931, "grad_norm": 0.23101741075515747, "learning_rate": 9.774371836051616e-05, "loss": 0.0089, "step": 14120 }, { "epoch": 0.8230908137706064, "grad_norm": 0.22205685079097748, "learning_rate": 9.773880477923315e-05, "loss": 0.0115, "step": 14130 }, { "epoch": 0.8236733267315198, "grad_norm": 0.24280622601509094, "learning_rate": 9.773388597730623e-05, "loss": 0.0117, "step": 14140 }, { "epoch": 0.8242558396924332, "grad_norm": 0.27662593126296997, "learning_rate": 9.77289619552733e-05, "loss": 0.0083, "step": 14150 }, { "epoch": 0.8248383526533465, "grad_norm": 0.285942941904068, "learning_rate": 9.772403271367285e-05, "loss": 0.0097, "step": 14160 }, { "epoch": 0.8254208656142599, "grad_norm": 0.18537402153015137, "learning_rate": 9.771909825304396e-05, "loss": 0.0106, "step": 14170 }, { "epoch": 0.8260033785751733, "grad_norm": 0.33190780878067017, "learning_rate": 9.771415857392619e-05, "loss": 0.0093, "step": 14180 }, { "epoch": 0.8265858915360866, "grad_norm": 0.24320708215236664, "learning_rate": 9.770921367685978e-05, "loss": 0.0096, "step": 14190 }, { "epoch": 0.827168404497, "grad_norm": 0.439033180475235, "learning_rate": 9.770426356238551e-05, "loss": 0.01, "step": 14200 }, { "epoch": 0.8277509174579134, "grad_norm": 0.34821608662605286, "learning_rate": 9.769930823104469e-05, "loss": 0.0105, "step": 14210 }, { "epoch": 0.8283334304188268, "grad_norm": 0.25592732429504395, "learning_rate": 9.769434768337926e-05, "loss": 0.0114, "step": 14220 }, { "epoch": 0.8289159433797402, "grad_norm": 0.2684550881385803, "learning_rate": 9.768938191993164e-05, "loss": 0.0108, "step": 14230 }, { "epoch": 0.8294984563406536, "grad_norm": 0.23204803466796875, "learning_rate": 9.768441094124494e-05, "loss": 0.0095, "step": 14240 }, { "epoch": 0.8300809693015669, "grad_norm": 0.27098479866981506, "learning_rate": 9.767943474786275e-05, "loss": 0.0094, "step": 14250 }, { "epoch": 0.8306634822624803, "grad_norm": 0.28506168723106384, "learning_rate": 9.767445334032923e-05, "loss": 0.0118, "step": 14260 }, { "epoch": 0.8312459952233937, "grad_norm": 0.24639317393302917, "learning_rate": 9.766946671918919e-05, "loss": 0.008, "step": 14270 }, { "epoch": 0.8318285081843071, "grad_norm": 0.2728094160556793, "learning_rate": 9.766447488498796e-05, "loss": 0.0093, "step": 14280 }, { "epoch": 0.8324110211452205, "grad_norm": 0.23610907793045044, "learning_rate": 9.765947783827139e-05, "loss": 0.0075, "step": 14290 }, { "epoch": 0.8329935341061339, "grad_norm": 0.23757407069206238, "learning_rate": 9.765447557958599e-05, "loss": 0.0094, "step": 14300 }, { "epoch": 0.8335760470670472, "grad_norm": 0.2379852831363678, "learning_rate": 9.764946810947879e-05, "loss": 0.0075, "step": 14310 }, { "epoch": 0.8341585600279606, "grad_norm": 0.27939897775650024, "learning_rate": 9.764445542849738e-05, "loss": 0.0121, "step": 14320 }, { "epoch": 0.834741072988874, "grad_norm": 0.2856020927429199, "learning_rate": 9.763943753718998e-05, "loss": 0.013, "step": 14330 }, { "epoch": 0.8353235859497874, "grad_norm": 0.3911533057689667, "learning_rate": 9.76344144361053e-05, "loss": 0.0121, "step": 14340 }, { "epoch": 0.8359060989107008, "grad_norm": 0.371832937002182, "learning_rate": 9.762938612579269e-05, "loss": 0.0095, "step": 14350 }, { "epoch": 0.8364886118716142, "grad_norm": 0.2932489812374115, "learning_rate": 9.762435260680202e-05, "loss": 0.0117, "step": 14360 }, { "epoch": 0.8370711248325275, "grad_norm": 0.29618608951568604, "learning_rate": 9.761931387968373e-05, "loss": 0.0114, "step": 14370 }, { "epoch": 0.8376536377934409, "grad_norm": 0.1337878704071045, "learning_rate": 9.76142699449889e-05, "loss": 0.0111, "step": 14380 }, { "epoch": 0.8382361507543543, "grad_norm": 0.18253973126411438, "learning_rate": 9.760922080326908e-05, "loss": 0.01, "step": 14390 }, { "epoch": 0.8388186637152677, "grad_norm": 0.2930071949958801, "learning_rate": 9.760416645507644e-05, "loss": 0.0103, "step": 14400 }, { "epoch": 0.8394011766761811, "grad_norm": 0.2643076777458191, "learning_rate": 9.759910690096375e-05, "loss": 0.0115, "step": 14410 }, { "epoch": 0.8399836896370945, "grad_norm": 0.17144091427326202, "learning_rate": 9.759404214148429e-05, "loss": 0.0114, "step": 14420 }, { "epoch": 0.8405662025980078, "grad_norm": 0.31187060475349426, "learning_rate": 9.758897217719191e-05, "loss": 0.0087, "step": 14430 }, { "epoch": 0.8411487155589212, "grad_norm": 0.4038289189338684, "learning_rate": 9.758389700864113e-05, "loss": 0.0093, "step": 14440 }, { "epoch": 0.8417312285198346, "grad_norm": 0.3281000554561615, "learning_rate": 9.757881663638688e-05, "loss": 0.0138, "step": 14450 }, { "epoch": 0.842313741480748, "grad_norm": 0.40026986598968506, "learning_rate": 9.757373106098478e-05, "loss": 0.0102, "step": 14460 }, { "epoch": 0.8428962544416613, "grad_norm": 0.38246893882751465, "learning_rate": 9.756864028299097e-05, "loss": 0.0126, "step": 14470 }, { "epoch": 0.8434787674025747, "grad_norm": 0.3427959084510803, "learning_rate": 9.75635443029622e-05, "loss": 0.0152, "step": 14480 }, { "epoch": 0.844061280363488, "grad_norm": 0.3884137272834778, "learning_rate": 9.755844312145572e-05, "loss": 0.0108, "step": 14490 }, { "epoch": 0.8446437933244014, "grad_norm": 0.3329746723175049, "learning_rate": 9.755333673902941e-05, "loss": 0.0073, "step": 14500 }, { "epoch": 0.8452263062853148, "grad_norm": 0.26027557253837585, "learning_rate": 9.75482251562417e-05, "loss": 0.0085, "step": 14510 }, { "epoch": 0.8458088192462282, "grad_norm": 0.31182798743247986, "learning_rate": 9.754310837365155e-05, "loss": 0.008, "step": 14520 }, { "epoch": 0.8463913322071416, "grad_norm": 0.222249835729599, "learning_rate": 9.753798639181856e-05, "loss": 0.0099, "step": 14530 }, { "epoch": 0.846973845168055, "grad_norm": 0.41301125288009644, "learning_rate": 9.753285921130286e-05, "loss": 0.0113, "step": 14540 }, { "epoch": 0.8475563581289683, "grad_norm": 0.26030293107032776, "learning_rate": 9.752772683266512e-05, "loss": 0.0117, "step": 14550 }, { "epoch": 0.8481388710898817, "grad_norm": 0.22344250977039337, "learning_rate": 9.752258925646665e-05, "loss": 0.012, "step": 14560 }, { "epoch": 0.8487213840507951, "grad_norm": 0.4999072551727295, "learning_rate": 9.751744648326926e-05, "loss": 0.0124, "step": 14570 }, { "epoch": 0.8493038970117085, "grad_norm": 0.1556071788072586, "learning_rate": 9.751229851363536e-05, "loss": 0.0097, "step": 14580 }, { "epoch": 0.8498864099726219, "grad_norm": 0.26244014501571655, "learning_rate": 9.750714534812793e-05, "loss": 0.0094, "step": 14590 }, { "epoch": 0.8504689229335353, "grad_norm": 0.2810560464859009, "learning_rate": 9.750198698731053e-05, "loss": 0.0106, "step": 14600 }, { "epoch": 0.8510514358944486, "grad_norm": 0.24330292642116547, "learning_rate": 9.749682343174722e-05, "loss": 0.0115, "step": 14610 }, { "epoch": 0.851633948855362, "grad_norm": 0.29982393980026245, "learning_rate": 9.749165468200272e-05, "loss": 0.0104, "step": 14620 }, { "epoch": 0.8522164618162754, "grad_norm": 0.28086981177330017, "learning_rate": 9.748648073864229e-05, "loss": 0.0126, "step": 14630 }, { "epoch": 0.8527989747771888, "grad_norm": 0.2285418063402176, "learning_rate": 9.748130160223168e-05, "loss": 0.0108, "step": 14640 }, { "epoch": 0.8533814877381022, "grad_norm": 0.26484277844429016, "learning_rate": 9.747611727333734e-05, "loss": 0.0082, "step": 14650 }, { "epoch": 0.8539640006990156, "grad_norm": 0.1948658972978592, "learning_rate": 9.74709277525262e-05, "loss": 0.009, "step": 14660 }, { "epoch": 0.854546513659929, "grad_norm": 0.24820363521575928, "learning_rate": 9.746573304036576e-05, "loss": 0.0081, "step": 14670 }, { "epoch": 0.8551290266208423, "grad_norm": 0.2779420018196106, "learning_rate": 9.746053313742412e-05, "loss": 0.0099, "step": 14680 }, { "epoch": 0.8557115395817557, "grad_norm": 0.31259065866470337, "learning_rate": 9.745532804426994e-05, "loss": 0.0082, "step": 14690 }, { "epoch": 0.8562940525426691, "grad_norm": 0.23451635241508484, "learning_rate": 9.745011776147242e-05, "loss": 0.0091, "step": 14700 }, { "epoch": 0.8568765655035825, "grad_norm": 0.2065698504447937, "learning_rate": 9.744490228960138e-05, "loss": 0.0159, "step": 14710 }, { "epoch": 0.8574590784644959, "grad_norm": 0.3244801461696625, "learning_rate": 9.743968162922713e-05, "loss": 0.0092, "step": 14720 }, { "epoch": 0.8580415914254093, "grad_norm": 0.27111542224884033, "learning_rate": 9.743445578092064e-05, "loss": 0.0102, "step": 14730 }, { "epoch": 0.8586241043863226, "grad_norm": 0.4484125077724457, "learning_rate": 9.742922474525338e-05, "loss": 0.0157, "step": 14740 }, { "epoch": 0.859206617347236, "grad_norm": 0.2820877730846405, "learning_rate": 9.742398852279741e-05, "loss": 0.0115, "step": 14750 }, { "epoch": 0.8597891303081494, "grad_norm": 0.24077697098255157, "learning_rate": 9.741874711412535e-05, "loss": 0.011, "step": 14760 }, { "epoch": 0.8603716432690628, "grad_norm": 0.39931720495224, "learning_rate": 9.741350051981042e-05, "loss": 0.0135, "step": 14770 }, { "epoch": 0.8609541562299761, "grad_norm": 0.22458259761333466, "learning_rate": 9.740824874042633e-05, "loss": 0.0114, "step": 14780 }, { "epoch": 0.8615366691908894, "grad_norm": 0.18846219778060913, "learning_rate": 9.740299177654746e-05, "loss": 0.0103, "step": 14790 }, { "epoch": 0.8621191821518028, "grad_norm": 0.29485419392585754, "learning_rate": 9.739772962874867e-05, "loss": 0.0068, "step": 14800 }, { "epoch": 0.8627016951127162, "grad_norm": 0.30281686782836914, "learning_rate": 9.739246229760541e-05, "loss": 0.0087, "step": 14810 }, { "epoch": 0.8632842080736296, "grad_norm": 0.2676441967487335, "learning_rate": 9.738718978369376e-05, "loss": 0.0083, "step": 14820 }, { "epoch": 0.863866721034543, "grad_norm": 0.28544166684150696, "learning_rate": 9.738191208759025e-05, "loss": 0.0103, "step": 14830 }, { "epoch": 0.8644492339954564, "grad_norm": 0.28684213757514954, "learning_rate": 9.73766292098721e-05, "loss": 0.009, "step": 14840 }, { "epoch": 0.8650317469563698, "grad_norm": 0.27581244707107544, "learning_rate": 9.737134115111699e-05, "loss": 0.0086, "step": 14850 }, { "epoch": 0.8656142599172831, "grad_norm": 0.2717691659927368, "learning_rate": 9.736604791190323e-05, "loss": 0.01, "step": 14860 }, { "epoch": 0.8661967728781965, "grad_norm": 0.2783054709434509, "learning_rate": 9.73607494928097e-05, "loss": 0.0094, "step": 14870 }, { "epoch": 0.8667792858391099, "grad_norm": 0.21667411923408508, "learning_rate": 9.735544589441581e-05, "loss": 0.0084, "step": 14880 }, { "epoch": 0.8673617988000233, "grad_norm": 0.18523572385311127, "learning_rate": 9.735013711730154e-05, "loss": 0.0085, "step": 14890 }, { "epoch": 0.8679443117609367, "grad_norm": 0.15348385274410248, "learning_rate": 9.734482316204747e-05, "loss": 0.0085, "step": 14900 }, { "epoch": 0.86852682472185, "grad_norm": 0.3414808511734009, "learning_rate": 9.733950402923473e-05, "loss": 0.0083, "step": 14910 }, { "epoch": 0.8691093376827634, "grad_norm": 0.36065009236335754, "learning_rate": 9.7334179719445e-05, "loss": 0.0124, "step": 14920 }, { "epoch": 0.8696918506436768, "grad_norm": 0.2742668688297272, "learning_rate": 9.732885023326053e-05, "loss": 0.0081, "step": 14930 }, { "epoch": 0.8702743636045902, "grad_norm": 0.34874412417411804, "learning_rate": 9.732351557126418e-05, "loss": 0.0098, "step": 14940 }, { "epoch": 0.8708568765655036, "grad_norm": 0.1615874320268631, "learning_rate": 9.731817573403929e-05, "loss": 0.0099, "step": 14950 }, { "epoch": 0.871439389526417, "grad_norm": 0.2603660523891449, "learning_rate": 9.731283072216985e-05, "loss": 0.0082, "step": 14960 }, { "epoch": 0.8720219024873304, "grad_norm": 0.3280786871910095, "learning_rate": 9.730748053624039e-05, "loss": 0.0082, "step": 14970 }, { "epoch": 0.8726044154482437, "grad_norm": 0.3203558027744293, "learning_rate": 9.730212517683598e-05, "loss": 0.0089, "step": 14980 }, { "epoch": 0.8731869284091571, "grad_norm": 0.29208970069885254, "learning_rate": 9.729676464454228e-05, "loss": 0.012, "step": 14990 }, { "epoch": 0.8737694413700705, "grad_norm": 0.38670334219932556, "learning_rate": 9.72913989399455e-05, "loss": 0.0085, "step": 15000 }, { "epoch": 0.8743519543309839, "grad_norm": 0.3031385540962219, "learning_rate": 9.728602806363242e-05, "loss": 0.009, "step": 15010 }, { "epoch": 0.8749344672918973, "grad_norm": 0.20666909217834473, "learning_rate": 9.728065201619043e-05, "loss": 0.0081, "step": 15020 }, { "epoch": 0.8755169802528107, "grad_norm": 0.23560909926891327, "learning_rate": 9.727527079820742e-05, "loss": 0.0129, "step": 15030 }, { "epoch": 0.876099493213724, "grad_norm": 0.29215702414512634, "learning_rate": 9.726988441027186e-05, "loss": 0.011, "step": 15040 }, { "epoch": 0.8766820061746374, "grad_norm": 0.287646621465683, "learning_rate": 9.726449285297281e-05, "loss": 0.01, "step": 15050 }, { "epoch": 0.8772645191355508, "grad_norm": 0.33430978655815125, "learning_rate": 9.72590961268999e-05, "loss": 0.0104, "step": 15060 }, { "epoch": 0.8778470320964642, "grad_norm": 0.28966373205184937, "learning_rate": 9.725369423264328e-05, "loss": 0.0096, "step": 15070 }, { "epoch": 0.8784295450573776, "grad_norm": 0.3987903594970703, "learning_rate": 9.72482871707937e-05, "loss": 0.0085, "step": 15080 }, { "epoch": 0.8790120580182909, "grad_norm": 0.1999264806509018, "learning_rate": 9.724287494194247e-05, "loss": 0.0095, "step": 15090 }, { "epoch": 0.8795945709792042, "grad_norm": 0.38007885217666626, "learning_rate": 9.723745754668147e-05, "loss": 0.0103, "step": 15100 }, { "epoch": 0.8801770839401176, "grad_norm": 0.42199069261550903, "learning_rate": 9.723203498560313e-05, "loss": 0.0116, "step": 15110 }, { "epoch": 0.880759596901031, "grad_norm": 0.20755748450756073, "learning_rate": 9.722660725930046e-05, "loss": 0.013, "step": 15120 }, { "epoch": 0.8813421098619444, "grad_norm": 0.3063476085662842, "learning_rate": 9.722117436836702e-05, "loss": 0.0116, "step": 15130 }, { "epoch": 0.8819246228228578, "grad_norm": 0.20493236184120178, "learning_rate": 9.721573631339696e-05, "loss": 0.0087, "step": 15140 }, { "epoch": 0.8825071357837712, "grad_norm": 0.25990045070648193, "learning_rate": 9.721029309498494e-05, "loss": 0.0086, "step": 15150 }, { "epoch": 0.8830896487446845, "grad_norm": 0.21768619120121002, "learning_rate": 9.720484471372627e-05, "loss": 0.0088, "step": 15160 }, { "epoch": 0.8836721617055979, "grad_norm": 0.30064254999160767, "learning_rate": 9.719939117021673e-05, "loss": 0.0108, "step": 15170 }, { "epoch": 0.8842546746665113, "grad_norm": 0.30118998885154724, "learning_rate": 9.719393246505275e-05, "loss": 0.0077, "step": 15180 }, { "epoch": 0.8848371876274247, "grad_norm": 0.2041505128145218, "learning_rate": 9.718846859883128e-05, "loss": 0.009, "step": 15190 }, { "epoch": 0.8854197005883381, "grad_norm": 0.2770838141441345, "learning_rate": 9.718299957214982e-05, "loss": 0.0085, "step": 15200 }, { "epoch": 0.8860022135492515, "grad_norm": 0.34251463413238525, "learning_rate": 9.717752538560646e-05, "loss": 0.0093, "step": 15210 }, { "epoch": 0.8865847265101648, "grad_norm": 0.27091965079307556, "learning_rate": 9.717204603979986e-05, "loss": 0.0079, "step": 15220 }, { "epoch": 0.8871672394710782, "grad_norm": 0.32255449891090393, "learning_rate": 9.716656153532922e-05, "loss": 0.009, "step": 15230 }, { "epoch": 0.8877497524319916, "grad_norm": 0.3271324038505554, "learning_rate": 9.716107187279434e-05, "loss": 0.0111, "step": 15240 }, { "epoch": 0.888332265392905, "grad_norm": 0.3336637020111084, "learning_rate": 9.715557705279555e-05, "loss": 0.0143, "step": 15250 }, { "epoch": 0.8889147783538184, "grad_norm": 0.3623363971710205, "learning_rate": 9.715007707593372e-05, "loss": 0.0095, "step": 15260 }, { "epoch": 0.8894972913147318, "grad_norm": 0.39904308319091797, "learning_rate": 9.714457194281036e-05, "loss": 0.0122, "step": 15270 }, { "epoch": 0.8900798042756451, "grad_norm": 0.31550905108451843, "learning_rate": 9.713906165402751e-05, "loss": 0.0096, "step": 15280 }, { "epoch": 0.8906623172365585, "grad_norm": 0.2159205973148346, "learning_rate": 9.713354621018774e-05, "loss": 0.0118, "step": 15290 }, { "epoch": 0.8912448301974719, "grad_norm": 0.22835765779018402, "learning_rate": 9.712802561189422e-05, "loss": 0.0087, "step": 15300 }, { "epoch": 0.8918273431583853, "grad_norm": 0.24292369186878204, "learning_rate": 9.712249985975069e-05, "loss": 0.0105, "step": 15310 }, { "epoch": 0.8924098561192987, "grad_norm": 0.3178677260875702, "learning_rate": 9.71169689543614e-05, "loss": 0.0138, "step": 15320 }, { "epoch": 0.8929923690802121, "grad_norm": 0.23733508586883545, "learning_rate": 9.711143289633123e-05, "loss": 0.0119, "step": 15330 }, { "epoch": 0.8935748820411255, "grad_norm": 0.3010134696960449, "learning_rate": 9.710589168626561e-05, "loss": 0.0099, "step": 15340 }, { "epoch": 0.8941573950020388, "grad_norm": 0.1538250595331192, "learning_rate": 9.710034532477048e-05, "loss": 0.0086, "step": 15350 }, { "epoch": 0.8947399079629522, "grad_norm": 0.20818544924259186, "learning_rate": 9.709479381245239e-05, "loss": 0.0077, "step": 15360 }, { "epoch": 0.8953224209238656, "grad_norm": 0.3080218732357025, "learning_rate": 9.708923714991847e-05, "loss": 0.0071, "step": 15370 }, { "epoch": 0.895904933884779, "grad_norm": 0.20693665742874146, "learning_rate": 9.708367533777638e-05, "loss": 0.01, "step": 15380 }, { "epoch": 0.8964874468456924, "grad_norm": 0.2104368656873703, "learning_rate": 9.707810837663431e-05, "loss": 0.013, "step": 15390 }, { "epoch": 0.8970699598066056, "grad_norm": 0.2694410979747772, "learning_rate": 9.707253626710113e-05, "loss": 0.0109, "step": 15400 }, { "epoch": 0.897652472767519, "grad_norm": 0.2324301153421402, "learning_rate": 9.706695900978613e-05, "loss": 0.0091, "step": 15410 }, { "epoch": 0.8982349857284324, "grad_norm": 0.3066050112247467, "learning_rate": 9.706137660529926e-05, "loss": 0.0132, "step": 15420 }, { "epoch": 0.8988174986893458, "grad_norm": 0.21021494269371033, "learning_rate": 9.705578905425101e-05, "loss": 0.0083, "step": 15430 }, { "epoch": 0.8994000116502592, "grad_norm": 0.2369978129863739, "learning_rate": 9.705019635725241e-05, "loss": 0.0086, "step": 15440 }, { "epoch": 0.8999825246111726, "grad_norm": 0.31932052969932556, "learning_rate": 9.704459851491508e-05, "loss": 0.0119, "step": 15450 }, { "epoch": 0.900565037572086, "grad_norm": 0.2777937650680542, "learning_rate": 9.703899552785118e-05, "loss": 0.0093, "step": 15460 }, { "epoch": 0.9011475505329993, "grad_norm": 0.379960298538208, "learning_rate": 9.703338739667346e-05, "loss": 0.0102, "step": 15470 }, { "epoch": 0.9017300634939127, "grad_norm": 0.47795578837394714, "learning_rate": 9.70277741219952e-05, "loss": 0.0101, "step": 15480 }, { "epoch": 0.9023125764548261, "grad_norm": 0.29468977451324463, "learning_rate": 9.702215570443027e-05, "loss": 0.0124, "step": 15490 }, { "epoch": 0.9028950894157395, "grad_norm": 0.31233781576156616, "learning_rate": 9.701653214459309e-05, "loss": 0.0104, "step": 15500 }, { "epoch": 0.9034776023766529, "grad_norm": 0.3748420476913452, "learning_rate": 9.701090344309865e-05, "loss": 0.0094, "step": 15510 }, { "epoch": 0.9040601153375663, "grad_norm": 0.34255436062812805, "learning_rate": 9.700526960056247e-05, "loss": 0.0102, "step": 15520 }, { "epoch": 0.9046426282984796, "grad_norm": 0.3148612678050995, "learning_rate": 9.699963061760068e-05, "loss": 0.0168, "step": 15530 }, { "epoch": 0.905225141259393, "grad_norm": 0.32877713441848755, "learning_rate": 9.699398649482997e-05, "loss": 0.0166, "step": 15540 }, { "epoch": 0.9058076542203064, "grad_norm": 0.22209811210632324, "learning_rate": 9.698833723286753e-05, "loss": 0.011, "step": 15550 }, { "epoch": 0.9063901671812198, "grad_norm": 0.38074564933776855, "learning_rate": 9.698268283233118e-05, "loss": 0.0093, "step": 15560 }, { "epoch": 0.9069726801421332, "grad_norm": 0.3867954909801483, "learning_rate": 9.697702329383929e-05, "loss": 0.0095, "step": 15570 }, { "epoch": 0.9075551931030466, "grad_norm": 0.3680252432823181, "learning_rate": 9.697135861801074e-05, "loss": 0.0111, "step": 15580 }, { "epoch": 0.9081377060639599, "grad_norm": 0.25233256816864014, "learning_rate": 9.696568880546505e-05, "loss": 0.0103, "step": 15590 }, { "epoch": 0.9087202190248733, "grad_norm": 0.1801777184009552, "learning_rate": 9.696001385682223e-05, "loss": 0.0101, "step": 15600 }, { "epoch": 0.9093027319857867, "grad_norm": 0.25442376732826233, "learning_rate": 9.695433377270291e-05, "loss": 0.0088, "step": 15610 }, { "epoch": 0.9098852449467001, "grad_norm": 0.18827004730701447, "learning_rate": 9.694864855372824e-05, "loss": 0.0096, "step": 15620 }, { "epoch": 0.9104677579076135, "grad_norm": 0.21297258138656616, "learning_rate": 9.694295820051995e-05, "loss": 0.0076, "step": 15630 }, { "epoch": 0.9110502708685269, "grad_norm": 0.20297551155090332, "learning_rate": 9.693726271370032e-05, "loss": 0.0088, "step": 15640 }, { "epoch": 0.9116327838294402, "grad_norm": 0.269288569688797, "learning_rate": 9.693156209389221e-05, "loss": 0.013, "step": 15650 }, { "epoch": 0.9122152967903536, "grad_norm": 0.26717326045036316, "learning_rate": 9.692585634171905e-05, "loss": 0.0101, "step": 15660 }, { "epoch": 0.912797809751267, "grad_norm": 0.2755376100540161, "learning_rate": 9.692014545780476e-05, "loss": 0.0099, "step": 15670 }, { "epoch": 0.9133803227121804, "grad_norm": 0.33283963799476624, "learning_rate": 9.691442944277393e-05, "loss": 0.009, "step": 15680 }, { "epoch": 0.9139628356730938, "grad_norm": 0.20162728428840637, "learning_rate": 9.690870829725162e-05, "loss": 0.0091, "step": 15690 }, { "epoch": 0.9145453486340072, "grad_norm": 0.4073590338230133, "learning_rate": 9.69029820218635e-05, "loss": 0.0086, "step": 15700 }, { "epoch": 0.9151278615949204, "grad_norm": 0.21971596777439117, "learning_rate": 9.689725061723579e-05, "loss": 0.0109, "step": 15710 }, { "epoch": 0.9157103745558338, "grad_norm": 0.2323032021522522, "learning_rate": 9.689151408399527e-05, "loss": 0.0114, "step": 15720 }, { "epoch": 0.9162928875167472, "grad_norm": 0.29482191801071167, "learning_rate": 9.688577242276924e-05, "loss": 0.0087, "step": 15730 }, { "epoch": 0.9168754004776606, "grad_norm": 0.273419052362442, "learning_rate": 9.688002563418566e-05, "loss": 0.0098, "step": 15740 }, { "epoch": 0.917457913438574, "grad_norm": 0.30928051471710205, "learning_rate": 9.687427371887293e-05, "loss": 0.0086, "step": 15750 }, { "epoch": 0.9180404263994874, "grad_norm": 0.26768285036087036, "learning_rate": 9.686851667746012e-05, "loss": 0.0105, "step": 15760 }, { "epoch": 0.9186229393604007, "grad_norm": 0.4199555814266205, "learning_rate": 9.686275451057677e-05, "loss": 0.0122, "step": 15770 }, { "epoch": 0.9192054523213141, "grad_norm": 0.30215153098106384, "learning_rate": 9.685698721885308e-05, "loss": 0.0104, "step": 15780 }, { "epoch": 0.9197879652822275, "grad_norm": 0.32820796966552734, "learning_rate": 9.68512148029197e-05, "loss": 0.009, "step": 15790 }, { "epoch": 0.9203704782431409, "grad_norm": 0.28191787004470825, "learning_rate": 9.684543726340791e-05, "loss": 0.0089, "step": 15800 }, { "epoch": 0.9209529912040543, "grad_norm": 0.32282325625419617, "learning_rate": 9.683965460094952e-05, "loss": 0.0094, "step": 15810 }, { "epoch": 0.9215355041649677, "grad_norm": 0.24987705051898956, "learning_rate": 9.683386681617694e-05, "loss": 0.012, "step": 15820 }, { "epoch": 0.922118017125881, "grad_norm": 0.21561259031295776, "learning_rate": 9.68280739097231e-05, "loss": 0.0099, "step": 15830 }, { "epoch": 0.9227005300867944, "grad_norm": 0.26100295782089233, "learning_rate": 9.682227588222148e-05, "loss": 0.0096, "step": 15840 }, { "epoch": 0.9232830430477078, "grad_norm": 0.20140300691127777, "learning_rate": 9.681647273430618e-05, "loss": 0.0125, "step": 15850 }, { "epoch": 0.9238655560086212, "grad_norm": 0.20016059279441833, "learning_rate": 9.681066446661182e-05, "loss": 0.0076, "step": 15860 }, { "epoch": 0.9244480689695346, "grad_norm": 0.2669161856174469, "learning_rate": 9.680485107977357e-05, "loss": 0.0092, "step": 15870 }, { "epoch": 0.925030581930448, "grad_norm": 0.20481130480766296, "learning_rate": 9.679903257442716e-05, "loss": 0.0131, "step": 15880 }, { "epoch": 0.9256130948913613, "grad_norm": 0.2843770980834961, "learning_rate": 9.679320895120891e-05, "loss": 0.0097, "step": 15890 }, { "epoch": 0.9261956078522747, "grad_norm": 0.38216233253479004, "learning_rate": 9.67873802107557e-05, "loss": 0.0122, "step": 15900 }, { "epoch": 0.9267781208131881, "grad_norm": 0.2265479415655136, "learning_rate": 9.67815463537049e-05, "loss": 0.0121, "step": 15910 }, { "epoch": 0.9273606337741015, "grad_norm": 0.2548191547393799, "learning_rate": 9.677570738069457e-05, "loss": 0.0098, "step": 15920 }, { "epoch": 0.9279431467350149, "grad_norm": 0.18717391788959503, "learning_rate": 9.676986329236318e-05, "loss": 0.0159, "step": 15930 }, { "epoch": 0.9285256596959283, "grad_norm": 0.23326517641544342, "learning_rate": 9.676401408934987e-05, "loss": 0.0082, "step": 15940 }, { "epoch": 0.9291081726568416, "grad_norm": 0.34242764115333557, "learning_rate": 9.675815977229428e-05, "loss": 0.0101, "step": 15950 }, { "epoch": 0.929690685617755, "grad_norm": 0.4530187249183655, "learning_rate": 9.675230034183664e-05, "loss": 0.0129, "step": 15960 }, { "epoch": 0.9302731985786684, "grad_norm": 0.3714357614517212, "learning_rate": 9.674643579861773e-05, "loss": 0.0109, "step": 15970 }, { "epoch": 0.9308557115395818, "grad_norm": 0.2265966683626175, "learning_rate": 9.674056614327886e-05, "loss": 0.0089, "step": 15980 }, { "epoch": 0.9314382245004952, "grad_norm": 0.3329233229160309, "learning_rate": 9.673469137646198e-05, "loss": 0.0083, "step": 15990 }, { "epoch": 0.9320207374614086, "grad_norm": 0.300405353307724, "learning_rate": 9.67288114988095e-05, "loss": 0.008, "step": 16000 }, { "epoch": 0.932603250422322, "grad_norm": 0.2493903934955597, "learning_rate": 9.672292651096447e-05, "loss": 0.0073, "step": 16010 }, { "epoch": 0.9331857633832352, "grad_norm": 0.27534037828445435, "learning_rate": 9.671703641357042e-05, "loss": 0.0059, "step": 16020 }, { "epoch": 0.9337682763441486, "grad_norm": 0.20184127986431122, "learning_rate": 9.67111412072715e-05, "loss": 0.0077, "step": 16030 }, { "epoch": 0.934350789305062, "grad_norm": 0.23311267793178558, "learning_rate": 9.670524089271242e-05, "loss": 0.0103, "step": 16040 }, { "epoch": 0.9349333022659754, "grad_norm": 0.22423334419727325, "learning_rate": 9.669933547053842e-05, "loss": 0.0101, "step": 16050 }, { "epoch": 0.9355158152268888, "grad_norm": 0.30710041522979736, "learning_rate": 9.669342494139531e-05, "loss": 0.0081, "step": 16060 }, { "epoch": 0.9360983281878021, "grad_norm": 0.25574588775634766, "learning_rate": 9.668750930592943e-05, "loss": 0.0089, "step": 16070 }, { "epoch": 0.9366808411487155, "grad_norm": 0.2827922999858856, "learning_rate": 9.668158856478775e-05, "loss": 0.0104, "step": 16080 }, { "epoch": 0.9372633541096289, "grad_norm": 0.2747807502746582, "learning_rate": 9.66756627186177e-05, "loss": 0.0081, "step": 16090 }, { "epoch": 0.9378458670705423, "grad_norm": 0.15155217051506042, "learning_rate": 9.666973176806737e-05, "loss": 0.0088, "step": 16100 }, { "epoch": 0.9384283800314557, "grad_norm": 0.3908017873764038, "learning_rate": 9.666379571378534e-05, "loss": 0.0106, "step": 16110 }, { "epoch": 0.9390108929923691, "grad_norm": 0.23887716233730316, "learning_rate": 9.665785455642076e-05, "loss": 0.0091, "step": 16120 }, { "epoch": 0.9395934059532824, "grad_norm": 0.2135385274887085, "learning_rate": 9.665190829662337e-05, "loss": 0.0071, "step": 16130 }, { "epoch": 0.9401759189141958, "grad_norm": 0.22224880754947662, "learning_rate": 9.664595693504342e-05, "loss": 0.0093, "step": 16140 }, { "epoch": 0.9407584318751092, "grad_norm": 0.2852906584739685, "learning_rate": 9.664000047233175e-05, "loss": 0.0104, "step": 16150 }, { "epoch": 0.9413409448360226, "grad_norm": 0.2898379862308502, "learning_rate": 9.663403890913976e-05, "loss": 0.0134, "step": 16160 }, { "epoch": 0.941923457796936, "grad_norm": 0.23455210030078888, "learning_rate": 9.662807224611938e-05, "loss": 0.0074, "step": 16170 }, { "epoch": 0.9425059707578494, "grad_norm": 0.25744155049324036, "learning_rate": 9.662210048392311e-05, "loss": 0.0135, "step": 16180 }, { "epoch": 0.9430884837187627, "grad_norm": 0.2177501767873764, "learning_rate": 9.661612362320405e-05, "loss": 0.0095, "step": 16190 }, { "epoch": 0.9436709966796761, "grad_norm": 0.19697417318820953, "learning_rate": 9.661014166461579e-05, "loss": 0.0116, "step": 16200 }, { "epoch": 0.9442535096405895, "grad_norm": 0.1771872490644455, "learning_rate": 9.66041546088125e-05, "loss": 0.0069, "step": 16210 }, { "epoch": 0.9448360226015029, "grad_norm": 0.4138948321342468, "learning_rate": 9.659816245644895e-05, "loss": 0.0098, "step": 16220 }, { "epoch": 0.9454185355624163, "grad_norm": 0.22246761620044708, "learning_rate": 9.65921652081804e-05, "loss": 0.0102, "step": 16230 }, { "epoch": 0.9460010485233297, "grad_norm": 0.29339176416397095, "learning_rate": 9.658616286466271e-05, "loss": 0.0097, "step": 16240 }, { "epoch": 0.946583561484243, "grad_norm": 0.32499024271965027, "learning_rate": 9.65801554265523e-05, "loss": 0.009, "step": 16250 }, { "epoch": 0.9471660744451564, "grad_norm": 0.21990960836410522, "learning_rate": 9.657414289450612e-05, "loss": 0.0113, "step": 16260 }, { "epoch": 0.9477485874060698, "grad_norm": 0.23116779327392578, "learning_rate": 9.656812526918171e-05, "loss": 0.009, "step": 16270 }, { "epoch": 0.9483311003669832, "grad_norm": 0.31842973828315735, "learning_rate": 9.656210255123712e-05, "loss": 0.0094, "step": 16280 }, { "epoch": 0.9489136133278966, "grad_norm": 0.1891745924949646, "learning_rate": 9.6556074741331e-05, "loss": 0.0122, "step": 16290 }, { "epoch": 0.94949612628881, "grad_norm": 0.31284651160240173, "learning_rate": 9.655004184012256e-05, "loss": 0.0088, "step": 16300 }, { "epoch": 0.9500786392497234, "grad_norm": 0.36398646235466003, "learning_rate": 9.654400384827152e-05, "loss": 0.0107, "step": 16310 }, { "epoch": 0.9506611522106367, "grad_norm": 0.16155967116355896, "learning_rate": 9.653796076643818e-05, "loss": 0.0079, "step": 16320 }, { "epoch": 0.95124366517155, "grad_norm": 0.17824356257915497, "learning_rate": 9.653191259528344e-05, "loss": 0.0094, "step": 16330 }, { "epoch": 0.9518261781324634, "grad_norm": 0.2052920013666153, "learning_rate": 9.65258593354687e-05, "loss": 0.008, "step": 16340 }, { "epoch": 0.9524086910933768, "grad_norm": 0.23825603723526, "learning_rate": 9.651980098765591e-05, "loss": 0.0086, "step": 16350 }, { "epoch": 0.9529912040542902, "grad_norm": 0.21125264465808868, "learning_rate": 9.651373755250765e-05, "loss": 0.0093, "step": 16360 }, { "epoch": 0.9535737170152035, "grad_norm": 0.34205567836761475, "learning_rate": 9.650766903068697e-05, "loss": 0.0083, "step": 16370 }, { "epoch": 0.9541562299761169, "grad_norm": 0.28792518377304077, "learning_rate": 9.650159542285753e-05, "loss": 0.0119, "step": 16380 }, { "epoch": 0.9547387429370303, "grad_norm": 0.2658522129058838, "learning_rate": 9.649551672968353e-05, "loss": 0.0109, "step": 16390 }, { "epoch": 0.9553212558979437, "grad_norm": 0.33001068234443665, "learning_rate": 9.648943295182973e-05, "loss": 0.0102, "step": 16400 }, { "epoch": 0.9559037688588571, "grad_norm": 0.24698160588741302, "learning_rate": 9.648334408996144e-05, "loss": 0.01, "step": 16410 }, { "epoch": 0.9564862818197705, "grad_norm": 0.20906685292720795, "learning_rate": 9.647725014474452e-05, "loss": 0.0073, "step": 16420 }, { "epoch": 0.9570687947806839, "grad_norm": 0.17769555747509003, "learning_rate": 9.64711511168454e-05, "loss": 0.0078, "step": 16430 }, { "epoch": 0.9576513077415972, "grad_norm": 0.21248449385166168, "learning_rate": 9.646504700693108e-05, "loss": 0.0087, "step": 16440 }, { "epoch": 0.9582338207025106, "grad_norm": 0.19710808992385864, "learning_rate": 9.645893781566907e-05, "loss": 0.0078, "step": 16450 }, { "epoch": 0.958816333663424, "grad_norm": 0.3505627512931824, "learning_rate": 9.645282354372744e-05, "loss": 0.009, "step": 16460 }, { "epoch": 0.9593988466243374, "grad_norm": 0.2904420495033264, "learning_rate": 9.644670419177491e-05, "loss": 0.0091, "step": 16470 }, { "epoch": 0.9599813595852508, "grad_norm": 0.2943032383918762, "learning_rate": 9.644057976048062e-05, "loss": 0.0097, "step": 16480 }, { "epoch": 0.9605638725461642, "grad_norm": 0.3263625204563141, "learning_rate": 9.643445025051435e-05, "loss": 0.0118, "step": 16490 }, { "epoch": 0.9611463855070775, "grad_norm": 0.1629473865032196, "learning_rate": 9.642831566254641e-05, "loss": 0.0099, "step": 16500 }, { "epoch": 0.9617288984679909, "grad_norm": 0.23222537338733673, "learning_rate": 9.642217599724769e-05, "loss": 0.0083, "step": 16510 }, { "epoch": 0.9623114114289043, "grad_norm": 0.35047051310539246, "learning_rate": 9.64160312552896e-05, "loss": 0.01, "step": 16520 }, { "epoch": 0.9628939243898177, "grad_norm": 0.33238905668258667, "learning_rate": 9.64098814373441e-05, "loss": 0.009, "step": 16530 }, { "epoch": 0.9634764373507311, "grad_norm": 0.2571895122528076, "learning_rate": 9.640372654408374e-05, "loss": 0.0096, "step": 16540 }, { "epoch": 0.9640589503116445, "grad_norm": 0.5036671757698059, "learning_rate": 9.639756657618162e-05, "loss": 0.0105, "step": 16550 }, { "epoch": 0.9646414632725578, "grad_norm": 0.2710375189781189, "learning_rate": 9.639140153431138e-05, "loss": 0.01, "step": 16560 }, { "epoch": 0.9652239762334712, "grad_norm": 0.24988019466400146, "learning_rate": 9.638523141914721e-05, "loss": 0.011, "step": 16570 }, { "epoch": 0.9658064891943846, "grad_norm": 0.39135363698005676, "learning_rate": 9.637905623136388e-05, "loss": 0.0114, "step": 16580 }, { "epoch": 0.966389002155298, "grad_norm": 0.3414090871810913, "learning_rate": 9.637287597163669e-05, "loss": 0.0089, "step": 16590 }, { "epoch": 0.9669715151162114, "grad_norm": 0.25015079975128174, "learning_rate": 9.63666906406415e-05, "loss": 0.0095, "step": 16600 }, { "epoch": 0.9675540280771248, "grad_norm": 0.30364280939102173, "learning_rate": 9.636050023905473e-05, "loss": 0.0107, "step": 16610 }, { "epoch": 0.9681365410380381, "grad_norm": 0.19882604479789734, "learning_rate": 9.635430476755336e-05, "loss": 0.0106, "step": 16620 }, { "epoch": 0.9687190539989515, "grad_norm": 0.26910707354545593, "learning_rate": 9.63481042268149e-05, "loss": 0.0077, "step": 16630 }, { "epoch": 0.9693015669598648, "grad_norm": 0.1904679536819458, "learning_rate": 9.634189861751745e-05, "loss": 0.009, "step": 16640 }, { "epoch": 0.9698840799207782, "grad_norm": 0.3047160506248474, "learning_rate": 9.633568794033967e-05, "loss": 0.0113, "step": 16650 }, { "epoch": 0.9704665928816916, "grad_norm": 0.21404016017913818, "learning_rate": 9.63294721959607e-05, "loss": 0.0079, "step": 16660 }, { "epoch": 0.971049105842605, "grad_norm": 0.40131306648254395, "learning_rate": 9.63232513850603e-05, "loss": 0.0092, "step": 16670 }, { "epoch": 0.9716316188035183, "grad_norm": 0.2385704070329666, "learning_rate": 9.631702550831878e-05, "loss": 0.0085, "step": 16680 }, { "epoch": 0.9722141317644317, "grad_norm": 0.2843262851238251, "learning_rate": 9.631079456641698e-05, "loss": 0.0093, "step": 16690 }, { "epoch": 0.9727966447253451, "grad_norm": 0.31387999653816223, "learning_rate": 9.630455856003632e-05, "loss": 0.0134, "step": 16700 }, { "epoch": 0.9733791576862585, "grad_norm": 0.2405312955379486, "learning_rate": 9.629831748985876e-05, "loss": 0.0077, "step": 16710 }, { "epoch": 0.9739616706471719, "grad_norm": 0.2719535529613495, "learning_rate": 9.629207135656679e-05, "loss": 0.0113, "step": 16720 }, { "epoch": 0.9745441836080853, "grad_norm": 0.23348228633403778, "learning_rate": 9.628582016084353e-05, "loss": 0.0064, "step": 16730 }, { "epoch": 0.9751266965689986, "grad_norm": 0.18567639589309692, "learning_rate": 9.627956390337254e-05, "loss": 0.0065, "step": 16740 }, { "epoch": 0.975709209529912, "grad_norm": 0.19491897523403168, "learning_rate": 9.627330258483802e-05, "loss": 0.0072, "step": 16750 }, { "epoch": 0.9762917224908254, "grad_norm": 0.2925581932067871, "learning_rate": 9.62670362059247e-05, "loss": 0.0093, "step": 16760 }, { "epoch": 0.9768742354517388, "grad_norm": 0.31510162353515625, "learning_rate": 9.626076476731786e-05, "loss": 0.0117, "step": 16770 }, { "epoch": 0.9774567484126522, "grad_norm": 0.5108999609947205, "learning_rate": 9.625448826970336e-05, "loss": 0.0085, "step": 16780 }, { "epoch": 0.9780392613735656, "grad_norm": 0.4646347165107727, "learning_rate": 9.624820671376755e-05, "loss": 0.0091, "step": 16790 }, { "epoch": 0.9786217743344789, "grad_norm": 0.5064445734024048, "learning_rate": 9.62419201001974e-05, "loss": 0.0108, "step": 16800 }, { "epoch": 0.9792042872953923, "grad_norm": 0.25214001536369324, "learning_rate": 9.623562842968037e-05, "loss": 0.0114, "step": 16810 }, { "epoch": 0.9797868002563057, "grad_norm": 0.23407000303268433, "learning_rate": 9.622933170290454e-05, "loss": 0.0088, "step": 16820 }, { "epoch": 0.9803693132172191, "grad_norm": 0.3106597661972046, "learning_rate": 9.622302992055849e-05, "loss": 0.0083, "step": 16830 }, { "epoch": 0.9809518261781325, "grad_norm": 0.32557666301727295, "learning_rate": 9.62167230833314e-05, "loss": 0.0072, "step": 16840 }, { "epoch": 0.9815343391390459, "grad_norm": 0.3730427622795105, "learning_rate": 9.621041119191295e-05, "loss": 0.0078, "step": 16850 }, { "epoch": 0.9821168520999592, "grad_norm": 0.22847527265548706, "learning_rate": 9.620409424699342e-05, "loss": 0.0076, "step": 16860 }, { "epoch": 0.9826993650608726, "grad_norm": 0.3418012261390686, "learning_rate": 9.619777224926359e-05, "loss": 0.0078, "step": 16870 }, { "epoch": 0.983281878021786, "grad_norm": 0.27470433712005615, "learning_rate": 9.619144519941485e-05, "loss": 0.0118, "step": 16880 }, { "epoch": 0.9838643909826994, "grad_norm": 0.3527732789516449, "learning_rate": 9.618511309813912e-05, "loss": 0.014, "step": 16890 }, { "epoch": 0.9844469039436128, "grad_norm": 0.2035803645849228, "learning_rate": 9.617877594612886e-05, "loss": 0.0143, "step": 16900 }, { "epoch": 0.9850294169045262, "grad_norm": 0.23817038536071777, "learning_rate": 9.617243374407707e-05, "loss": 0.0078, "step": 16910 }, { "epoch": 0.9856119298654396, "grad_norm": 0.32586801052093506, "learning_rate": 9.616608649267736e-05, "loss": 0.0094, "step": 16920 }, { "epoch": 0.9861944428263529, "grad_norm": 0.2673438787460327, "learning_rate": 9.615973419262385e-05, "loss": 0.0072, "step": 16930 }, { "epoch": 0.9867769557872663, "grad_norm": 0.30563822388648987, "learning_rate": 9.615337684461119e-05, "loss": 0.01, "step": 16940 }, { "epoch": 0.9873594687481796, "grad_norm": 0.13866914808750153, "learning_rate": 9.614701444933465e-05, "loss": 0.0077, "step": 16950 }, { "epoch": 0.987941981709093, "grad_norm": 0.18126030266284943, "learning_rate": 9.614064700748997e-05, "loss": 0.0123, "step": 16960 }, { "epoch": 0.9885244946700064, "grad_norm": 0.2156129628419876, "learning_rate": 9.613427451977352e-05, "loss": 0.0099, "step": 16970 }, { "epoch": 0.9891070076309197, "grad_norm": 0.14302825927734375, "learning_rate": 9.612789698688216e-05, "loss": 0.0062, "step": 16980 }, { "epoch": 0.9896895205918331, "grad_norm": 0.26026448607444763, "learning_rate": 9.612151440951334e-05, "loss": 0.0111, "step": 16990 }, { "epoch": 0.9902720335527465, "grad_norm": 0.3039906620979309, "learning_rate": 9.611512678836506e-05, "loss": 0.0115, "step": 17000 }, { "epoch": 0.9908545465136599, "grad_norm": 0.1947077214717865, "learning_rate": 9.610873412413584e-05, "loss": 0.0082, "step": 17010 }, { "epoch": 0.9914370594745733, "grad_norm": 0.348951518535614, "learning_rate": 9.610233641752476e-05, "loss": 0.0104, "step": 17020 }, { "epoch": 0.9920195724354867, "grad_norm": 0.35439684987068176, "learning_rate": 9.609593366923151e-05, "loss": 0.0088, "step": 17030 }, { "epoch": 0.9926020853964, "grad_norm": 0.2358553558588028, "learning_rate": 9.608952587995625e-05, "loss": 0.0086, "step": 17040 }, { "epoch": 0.9931845983573134, "grad_norm": 0.24500906467437744, "learning_rate": 9.608311305039972e-05, "loss": 0.009, "step": 17050 }, { "epoch": 0.9937671113182268, "grad_norm": 0.5256273150444031, "learning_rate": 9.607669518126326e-05, "loss": 0.012, "step": 17060 }, { "epoch": 0.9943496242791402, "grad_norm": 0.30311456322669983, "learning_rate": 9.607027227324866e-05, "loss": 0.0113, "step": 17070 }, { "epoch": 0.9949321372400536, "grad_norm": 0.2836645245552063, "learning_rate": 9.606384432705837e-05, "loss": 0.0121, "step": 17080 }, { "epoch": 0.995514650200967, "grad_norm": 0.21070818603038788, "learning_rate": 9.60574113433953e-05, "loss": 0.007, "step": 17090 }, { "epoch": 0.9960971631618804, "grad_norm": 0.23422352969646454, "learning_rate": 9.6050973322963e-05, "loss": 0.0106, "step": 17100 }, { "epoch": 0.9966796761227937, "grad_norm": 0.32362690567970276, "learning_rate": 9.604453026646547e-05, "loss": 0.0111, "step": 17110 }, { "epoch": 0.9972621890837071, "grad_norm": 0.2853545546531677, "learning_rate": 9.603808217460735e-05, "loss": 0.0103, "step": 17120 }, { "epoch": 0.9978447020446205, "grad_norm": 0.2949511706829071, "learning_rate": 9.603162904809377e-05, "loss": 0.0134, "step": 17130 }, { "epoch": 0.9984272150055339, "grad_norm": 0.1534668505191803, "learning_rate": 9.602517088763045e-05, "loss": 0.0121, "step": 17140 }, { "epoch": 0.9990097279664473, "grad_norm": 0.28179115056991577, "learning_rate": 9.601870769392365e-05, "loss": 0.0099, "step": 17150 }, { "epoch": 0.9995922409273607, "grad_norm": 0.23799347877502441, "learning_rate": 9.601223946768017e-05, "loss": 0.0101, "step": 17160 }, { "epoch": 1.000174753888274, "grad_norm": 0.3252301812171936, "learning_rate": 9.600576620960734e-05, "loss": 0.0077, "step": 17170 }, { "epoch": 1.0007572668491873, "grad_norm": 0.36093583703041077, "learning_rate": 9.599928792041308e-05, "loss": 0.0101, "step": 17180 }, { "epoch": 1.0013397798101007, "grad_norm": 0.2646123170852661, "learning_rate": 9.599280460080587e-05, "loss": 0.0094, "step": 17190 }, { "epoch": 1.001922292771014, "grad_norm": 0.2717488706111908, "learning_rate": 9.59863162514947e-05, "loss": 0.01, "step": 17200 }, { "epoch": 1.0025048057319275, "grad_norm": 0.31543856859207153, "learning_rate": 9.597982287318911e-05, "loss": 0.0122, "step": 17210 }, { "epoch": 1.0030873186928408, "grad_norm": 0.3549472391605377, "learning_rate": 9.597332446659923e-05, "loss": 0.0089, "step": 17220 }, { "epoch": 1.0036698316537542, "grad_norm": 0.37390559911727905, "learning_rate": 9.59668210324357e-05, "loss": 0.0115, "step": 17230 }, { "epoch": 1.0042523446146676, "grad_norm": 0.32142946124076843, "learning_rate": 9.596031257140974e-05, "loss": 0.0093, "step": 17240 }, { "epoch": 1.004834857575581, "grad_norm": 0.2233838140964508, "learning_rate": 9.59537990842331e-05, "loss": 0.0128, "step": 17250 }, { "epoch": 1.0054173705364944, "grad_norm": 0.1996593475341797, "learning_rate": 9.594728057161806e-05, "loss": 0.0088, "step": 17260 }, { "epoch": 1.0059998834974078, "grad_norm": 0.37831416726112366, "learning_rate": 9.594075703427752e-05, "loss": 0.0105, "step": 17270 }, { "epoch": 1.0065823964583211, "grad_norm": 0.339951753616333, "learning_rate": 9.593422847292486e-05, "loss": 0.0112, "step": 17280 }, { "epoch": 1.0071649094192345, "grad_norm": 0.31171002984046936, "learning_rate": 9.592769488827402e-05, "loss": 0.0085, "step": 17290 }, { "epoch": 1.007747422380148, "grad_norm": 0.1713646799325943, "learning_rate": 9.592115628103952e-05, "loss": 0.0106, "step": 17300 }, { "epoch": 1.0083299353410613, "grad_norm": 0.2584535479545593, "learning_rate": 9.591461265193643e-05, "loss": 0.0075, "step": 17310 }, { "epoch": 1.0089124483019747, "grad_norm": 0.22997747361660004, "learning_rate": 9.590806400168032e-05, "loss": 0.0093, "step": 17320 }, { "epoch": 1.009494961262888, "grad_norm": 0.4811360836029053, "learning_rate": 9.590151033098735e-05, "loss": 0.0112, "step": 17330 }, { "epoch": 1.0100774742238015, "grad_norm": 0.27230027318000793, "learning_rate": 9.589495164057423e-05, "loss": 0.0121, "step": 17340 }, { "epoch": 1.0106599871847148, "grad_norm": 0.24612224102020264, "learning_rate": 9.58883879311582e-05, "loss": 0.0106, "step": 17350 }, { "epoch": 1.0112425001456282, "grad_norm": 0.2288588285446167, "learning_rate": 9.588181920345705e-05, "loss": 0.0116, "step": 17360 }, { "epoch": 1.0118250131065416, "grad_norm": 0.2556689977645874, "learning_rate": 9.587524545818913e-05, "loss": 0.0079, "step": 17370 }, { "epoch": 1.012407526067455, "grad_norm": 0.3269619047641754, "learning_rate": 9.586866669607335e-05, "loss": 0.0095, "step": 17380 }, { "epoch": 1.0129900390283684, "grad_norm": 0.2677159309387207, "learning_rate": 9.586208291782915e-05, "loss": 0.0081, "step": 17390 }, { "epoch": 1.0135725519892818, "grad_norm": 0.18840692937374115, "learning_rate": 9.58554941241765e-05, "loss": 0.0074, "step": 17400 }, { "epoch": 1.0141550649501951, "grad_norm": 0.2475997507572174, "learning_rate": 9.584890031583596e-05, "loss": 0.0097, "step": 17410 }, { "epoch": 1.0147375779111085, "grad_norm": 0.2897197902202606, "learning_rate": 9.584230149352861e-05, "loss": 0.0084, "step": 17420 }, { "epoch": 1.015320090872022, "grad_norm": 0.3592067360877991, "learning_rate": 9.58356976579761e-05, "loss": 0.0123, "step": 17430 }, { "epoch": 1.0159026038329353, "grad_norm": 0.43149271607398987, "learning_rate": 9.58290888099006e-05, "loss": 0.0087, "step": 17440 }, { "epoch": 1.0164851167938487, "grad_norm": 0.39486056566238403, "learning_rate": 9.582247495002486e-05, "loss": 0.0089, "step": 17450 }, { "epoch": 1.017067629754762, "grad_norm": 0.31658878922462463, "learning_rate": 9.581585607907214e-05, "loss": 0.0084, "step": 17460 }, { "epoch": 1.0176501427156754, "grad_norm": 0.41337084770202637, "learning_rate": 9.580923219776628e-05, "loss": 0.0086, "step": 17470 }, { "epoch": 1.0182326556765888, "grad_norm": 0.32175812125205994, "learning_rate": 9.580260330683167e-05, "loss": 0.0097, "step": 17480 }, { "epoch": 1.0188151686375022, "grad_norm": 0.26547762751579285, "learning_rate": 9.579596940699322e-05, "loss": 0.0092, "step": 17490 }, { "epoch": 1.0193976815984156, "grad_norm": 0.18049271404743195, "learning_rate": 9.578933049897643e-05, "loss": 0.0065, "step": 17500 }, { "epoch": 1.019980194559329, "grad_norm": 0.21994929015636444, "learning_rate": 9.578268658350728e-05, "loss": 0.0092, "step": 17510 }, { "epoch": 1.0205627075202424, "grad_norm": 0.258632093667984, "learning_rate": 9.577603766131235e-05, "loss": 0.0091, "step": 17520 }, { "epoch": 1.0211452204811557, "grad_norm": 0.3790711760520935, "learning_rate": 9.576938373311878e-05, "loss": 0.0095, "step": 17530 }, { "epoch": 1.0217277334420691, "grad_norm": 0.4214705526828766, "learning_rate": 9.576272479965421e-05, "loss": 0.0138, "step": 17540 }, { "epoch": 1.0223102464029825, "grad_norm": 0.27221059799194336, "learning_rate": 9.575606086164687e-05, "loss": 0.0119, "step": 17550 }, { "epoch": 1.022892759363896, "grad_norm": 0.30861854553222656, "learning_rate": 9.57493919198255e-05, "loss": 0.0131, "step": 17560 }, { "epoch": 1.0234752723248093, "grad_norm": 0.22383111715316772, "learning_rate": 9.57427179749194e-05, "loss": 0.0118, "step": 17570 }, { "epoch": 1.0240577852857227, "grad_norm": 0.37924861907958984, "learning_rate": 9.573603902765846e-05, "loss": 0.0115, "step": 17580 }, { "epoch": 1.024640298246636, "grad_norm": 0.22085320949554443, "learning_rate": 9.572935507877304e-05, "loss": 0.0084, "step": 17590 }, { "epoch": 1.0252228112075494, "grad_norm": 0.2940640449523926, "learning_rate": 9.57226661289941e-05, "loss": 0.0076, "step": 17600 }, { "epoch": 1.0258053241684628, "grad_norm": 0.22494004666805267, "learning_rate": 9.571597217905315e-05, "loss": 0.0088, "step": 17610 }, { "epoch": 1.0263878371293762, "grad_norm": 0.33589881658554077, "learning_rate": 9.57092732296822e-05, "loss": 0.0089, "step": 17620 }, { "epoch": 1.0269703500902896, "grad_norm": 0.22447529435157776, "learning_rate": 9.570256928161385e-05, "loss": 0.008, "step": 17630 }, { "epoch": 1.027552863051203, "grad_norm": 0.25237205624580383, "learning_rate": 9.569586033558126e-05, "loss": 0.0115, "step": 17640 }, { "epoch": 1.0281353760121164, "grad_norm": 0.28509223461151123, "learning_rate": 9.568914639231807e-05, "loss": 0.0089, "step": 17650 }, { "epoch": 1.0287178889730297, "grad_norm": 0.3627578318119049, "learning_rate": 9.568242745255852e-05, "loss": 0.009, "step": 17660 }, { "epoch": 1.0293004019339431, "grad_norm": 0.4396534562110901, "learning_rate": 9.567570351703739e-05, "loss": 0.0138, "step": 17670 }, { "epoch": 1.0298829148948565, "grad_norm": 0.271918386220932, "learning_rate": 9.566897458649001e-05, "loss": 0.0097, "step": 17680 }, { "epoch": 1.0304654278557699, "grad_norm": 0.2272304743528366, "learning_rate": 9.566224066165221e-05, "loss": 0.0086, "step": 17690 }, { "epoch": 1.0310479408166833, "grad_norm": 0.4571428596973419, "learning_rate": 9.565550174326043e-05, "loss": 0.0158, "step": 17700 }, { "epoch": 1.0316304537775967, "grad_norm": 0.2531905174255371, "learning_rate": 9.564875783205162e-05, "loss": 0.0105, "step": 17710 }, { "epoch": 1.03221296673851, "grad_norm": 0.3544732630252838, "learning_rate": 9.564200892876328e-05, "loss": 0.0102, "step": 17720 }, { "epoch": 1.0327954796994234, "grad_norm": 0.25721198320388794, "learning_rate": 9.563525503413348e-05, "loss": 0.015, "step": 17730 }, { "epoch": 1.0333779926603366, "grad_norm": 0.32790854573249817, "learning_rate": 9.562849614890079e-05, "loss": 0.0118, "step": 17740 }, { "epoch": 1.03396050562125, "grad_norm": 0.16806204617023468, "learning_rate": 9.562173227380436e-05, "loss": 0.0092, "step": 17750 }, { "epoch": 1.0345430185821634, "grad_norm": 0.3138536810874939, "learning_rate": 9.561496340958389e-05, "loss": 0.0081, "step": 17760 }, { "epoch": 1.0351255315430767, "grad_norm": 0.4085492789745331, "learning_rate": 9.560818955697959e-05, "loss": 0.0131, "step": 17770 }, { "epoch": 1.0357080445039901, "grad_norm": 0.2861352264881134, "learning_rate": 9.560141071673228e-05, "loss": 0.0112, "step": 17780 }, { "epoch": 1.0362905574649035, "grad_norm": 0.2558354437351227, "learning_rate": 9.559462688958323e-05, "loss": 0.009, "step": 17790 }, { "epoch": 1.036873070425817, "grad_norm": 0.2755398452281952, "learning_rate": 9.558783807627434e-05, "loss": 0.0109, "step": 17800 }, { "epoch": 1.0374555833867303, "grad_norm": 0.40923741459846497, "learning_rate": 9.558104427754801e-05, "loss": 0.0096, "step": 17810 }, { "epoch": 1.0380380963476437, "grad_norm": 0.2748527228832245, "learning_rate": 9.557424549414722e-05, "loss": 0.0068, "step": 17820 }, { "epoch": 1.038620609308557, "grad_norm": 0.2709219753742218, "learning_rate": 9.556744172681546e-05, "loss": 0.0099, "step": 17830 }, { "epoch": 1.0392031222694704, "grad_norm": 0.29155150055885315, "learning_rate": 9.556063297629677e-05, "loss": 0.0116, "step": 17840 }, { "epoch": 1.0397856352303838, "grad_norm": 0.27497297525405884, "learning_rate": 9.555381924333578e-05, "loss": 0.018, "step": 17850 }, { "epoch": 1.0403681481912972, "grad_norm": 0.17289681732654572, "learning_rate": 9.554700052867758e-05, "loss": 0.0088, "step": 17860 }, { "epoch": 1.0409506611522106, "grad_norm": 0.21872493624687195, "learning_rate": 9.554017683306789e-05, "loss": 0.0097, "step": 17870 }, { "epoch": 1.041533174113124, "grad_norm": 0.275732159614563, "learning_rate": 9.553334815725294e-05, "loss": 0.0092, "step": 17880 }, { "epoch": 1.0421156870740373, "grad_norm": 0.21631677448749542, "learning_rate": 9.552651450197949e-05, "loss": 0.0096, "step": 17890 }, { "epoch": 1.0426982000349507, "grad_norm": 0.30010658502578735, "learning_rate": 9.551967586799486e-05, "loss": 0.0103, "step": 17900 }, { "epoch": 1.0432807129958641, "grad_norm": 0.28429222106933594, "learning_rate": 9.551283225604692e-05, "loss": 0.008, "step": 17910 }, { "epoch": 1.0438632259567775, "grad_norm": 0.22252453863620758, "learning_rate": 9.550598366688406e-05, "loss": 0.0128, "step": 17920 }, { "epoch": 1.0444457389176909, "grad_norm": 0.18596859276294708, "learning_rate": 9.549913010125526e-05, "loss": 0.0124, "step": 17930 }, { "epoch": 1.0450282518786043, "grad_norm": 0.20552079379558563, "learning_rate": 9.549227155990999e-05, "loss": 0.0118, "step": 17940 }, { "epoch": 1.0456107648395176, "grad_norm": 0.23177172243595123, "learning_rate": 9.548540804359828e-05, "loss": 0.0101, "step": 17950 }, { "epoch": 1.046193277800431, "grad_norm": 0.26211288571357727, "learning_rate": 9.547853955307077e-05, "loss": 0.0093, "step": 17960 }, { "epoch": 1.0467757907613444, "grad_norm": 0.29848727583885193, "learning_rate": 9.547166608907853e-05, "loss": 0.0151, "step": 17970 }, { "epoch": 1.0473583037222578, "grad_norm": 0.21395806968212128, "learning_rate": 9.546478765237326e-05, "loss": 0.0097, "step": 17980 }, { "epoch": 1.0479408166831712, "grad_norm": 0.20016519725322723, "learning_rate": 9.545790424370715e-05, "loss": 0.0109, "step": 17990 }, { "epoch": 1.0485233296440846, "grad_norm": 0.2715907394886017, "learning_rate": 9.5451015863833e-05, "loss": 0.0099, "step": 18000 }, { "epoch": 1.049105842604998, "grad_norm": 0.28530988097190857, "learning_rate": 9.544412251350408e-05, "loss": 0.0115, "step": 18010 }, { "epoch": 1.0496883555659113, "grad_norm": 0.25797078013420105, "learning_rate": 9.543722419347422e-05, "loss": 0.0119, "step": 18020 }, { "epoch": 1.0502708685268247, "grad_norm": 0.44010472297668457, "learning_rate": 9.543032090449788e-05, "loss": 0.0115, "step": 18030 }, { "epoch": 1.050853381487738, "grad_norm": 0.25002428889274597, "learning_rate": 9.542341264732992e-05, "loss": 0.0102, "step": 18040 }, { "epoch": 1.0514358944486515, "grad_norm": 0.29023683071136475, "learning_rate": 9.541649942272585e-05, "loss": 0.0141, "step": 18050 }, { "epoch": 1.0520184074095649, "grad_norm": 0.20239901542663574, "learning_rate": 9.54095812314417e-05, "loss": 0.0108, "step": 18060 }, { "epoch": 1.0526009203704783, "grad_norm": 0.2440972477197647, "learning_rate": 9.540265807423401e-05, "loss": 0.0075, "step": 18070 }, { "epoch": 1.0531834333313916, "grad_norm": 0.3092884421348572, "learning_rate": 9.53957299518599e-05, "loss": 0.0098, "step": 18080 }, { "epoch": 1.053765946292305, "grad_norm": 0.3672618269920349, "learning_rate": 9.5388796865077e-05, "loss": 0.0084, "step": 18090 }, { "epoch": 1.0543484592532184, "grad_norm": 0.3708522915840149, "learning_rate": 9.538185881464353e-05, "loss": 0.0093, "step": 18100 }, { "epoch": 1.0549309722141318, "grad_norm": 0.288604736328125, "learning_rate": 9.537491580131821e-05, "loss": 0.011, "step": 18110 }, { "epoch": 1.0555134851750452, "grad_norm": 0.3233449459075928, "learning_rate": 9.53679678258603e-05, "loss": 0.0109, "step": 18120 }, { "epoch": 1.0560959981359586, "grad_norm": 0.28323617577552795, "learning_rate": 9.536101488902966e-05, "loss": 0.0089, "step": 18130 }, { "epoch": 1.056678511096872, "grad_norm": 0.41213858127593994, "learning_rate": 9.535405699158663e-05, "loss": 0.0125, "step": 18140 }, { "epoch": 1.0572610240577853, "grad_norm": 0.16546808183193207, "learning_rate": 9.53470941342921e-05, "loss": 0.007, "step": 18150 }, { "epoch": 1.0578435370186987, "grad_norm": 0.3400897979736328, "learning_rate": 9.534012631790756e-05, "loss": 0.0104, "step": 18160 }, { "epoch": 1.058426049979612, "grad_norm": 0.2054438591003418, "learning_rate": 9.533315354319494e-05, "loss": 0.0076, "step": 18170 }, { "epoch": 1.0590085629405255, "grad_norm": 0.2800319790840149, "learning_rate": 9.532617581091682e-05, "loss": 0.0105, "step": 18180 }, { "epoch": 1.0595910759014389, "grad_norm": 0.20247745513916016, "learning_rate": 9.531919312183629e-05, "loss": 0.0113, "step": 18190 }, { "epoch": 1.0601735888623522, "grad_norm": 0.2870803475379944, "learning_rate": 9.531220547671688e-05, "loss": 0.0087, "step": 18200 }, { "epoch": 1.0607561018232656, "grad_norm": 0.16415493190288544, "learning_rate": 9.530521287632285e-05, "loss": 0.0057, "step": 18210 }, { "epoch": 1.061338614784179, "grad_norm": 0.35486486554145813, "learning_rate": 9.529821532141884e-05, "loss": 0.0089, "step": 18220 }, { "epoch": 1.0619211277450924, "grad_norm": 0.2759651243686676, "learning_rate": 9.52912128127701e-05, "loss": 0.0115, "step": 18230 }, { "epoch": 1.0625036407060058, "grad_norm": 0.24264861643314362, "learning_rate": 9.528420535114244e-05, "loss": 0.0074, "step": 18240 }, { "epoch": 1.0630861536669192, "grad_norm": 0.30102604627609253, "learning_rate": 9.527719293730215e-05, "loss": 0.0107, "step": 18250 }, { "epoch": 1.0636686666278325, "grad_norm": 0.23720404505729675, "learning_rate": 9.527017557201611e-05, "loss": 0.0084, "step": 18260 }, { "epoch": 1.064251179588746, "grad_norm": 0.23721691966056824, "learning_rate": 9.526315325605176e-05, "loss": 0.0086, "step": 18270 }, { "epoch": 1.0648336925496593, "grad_norm": 0.2165217101573944, "learning_rate": 9.525612599017699e-05, "loss": 0.0068, "step": 18280 }, { "epoch": 1.0654162055105727, "grad_norm": 0.335054486989975, "learning_rate": 9.524909377516033e-05, "loss": 0.0092, "step": 18290 }, { "epoch": 1.065998718471486, "grad_norm": 0.2672024965286255, "learning_rate": 9.524205661177081e-05, "loss": 0.01, "step": 18300 }, { "epoch": 1.0665812314323995, "grad_norm": 0.5663777589797974, "learning_rate": 9.523501450077801e-05, "loss": 0.01, "step": 18310 }, { "epoch": 1.0671637443933129, "grad_norm": 0.14281752705574036, "learning_rate": 9.522796744295202e-05, "loss": 0.0058, "step": 18320 }, { "epoch": 1.067746257354226, "grad_norm": 0.31198710203170776, "learning_rate": 9.522091543906352e-05, "loss": 0.0109, "step": 18330 }, { "epoch": 1.0683287703151394, "grad_norm": 0.19071653485298157, "learning_rate": 9.521385848988369e-05, "loss": 0.0091, "step": 18340 }, { "epoch": 1.0689112832760528, "grad_norm": 0.20137734711170197, "learning_rate": 9.520679659618428e-05, "loss": 0.0081, "step": 18350 }, { "epoch": 1.0694937962369662, "grad_norm": 0.32009968161582947, "learning_rate": 9.519972975873754e-05, "loss": 0.0094, "step": 18360 }, { "epoch": 1.0700763091978795, "grad_norm": 0.32711243629455566, "learning_rate": 9.519265797831633e-05, "loss": 0.0124, "step": 18370 }, { "epoch": 1.070658822158793, "grad_norm": 0.32825329899787903, "learning_rate": 9.518558125569399e-05, "loss": 0.0081, "step": 18380 }, { "epoch": 1.0712413351197063, "grad_norm": 0.28535810112953186, "learning_rate": 9.517849959164442e-05, "loss": 0.0099, "step": 18390 }, { "epoch": 1.0718238480806197, "grad_norm": 0.42163893580436707, "learning_rate": 9.517141298694205e-05, "loss": 0.0113, "step": 18400 }, { "epoch": 1.072406361041533, "grad_norm": 0.23629902303218842, "learning_rate": 9.516432144236188e-05, "loss": 0.011, "step": 18410 }, { "epoch": 1.0729888740024465, "grad_norm": 0.2067478895187378, "learning_rate": 9.515722495867941e-05, "loss": 0.0091, "step": 18420 }, { "epoch": 1.0735713869633599, "grad_norm": 0.26856258511543274, "learning_rate": 9.515012353667072e-05, "loss": 0.0088, "step": 18430 }, { "epoch": 1.0741538999242732, "grad_norm": 0.2834338843822479, "learning_rate": 9.51430171771124e-05, "loss": 0.0079, "step": 18440 }, { "epoch": 1.0747364128851866, "grad_norm": 0.2789689600467682, "learning_rate": 9.513590588078159e-05, "loss": 0.0093, "step": 18450 }, { "epoch": 1.0753189258461, "grad_norm": 0.19820670783519745, "learning_rate": 9.512878964845597e-05, "loss": 0.011, "step": 18460 }, { "epoch": 1.0759014388070134, "grad_norm": 0.2730116844177246, "learning_rate": 9.512166848091377e-05, "loss": 0.0072, "step": 18470 }, { "epoch": 1.0764839517679268, "grad_norm": 0.28992313146591187, "learning_rate": 9.511454237893376e-05, "loss": 0.0062, "step": 18480 }, { "epoch": 1.0770664647288402, "grad_norm": 0.3286808729171753, "learning_rate": 9.51074113432952e-05, "loss": 0.0087, "step": 18490 }, { "epoch": 1.0776489776897535, "grad_norm": 0.2682126760482788, "learning_rate": 9.510027537477797e-05, "loss": 0.0088, "step": 18500 }, { "epoch": 1.078231490650667, "grad_norm": 0.2673265337944031, "learning_rate": 9.509313447416242e-05, "loss": 0.0094, "step": 18510 }, { "epoch": 1.0788140036115803, "grad_norm": 0.31656259298324585, "learning_rate": 9.508598864222949e-05, "loss": 0.012, "step": 18520 }, { "epoch": 1.0793965165724937, "grad_norm": 0.36754146218299866, "learning_rate": 9.507883787976062e-05, "loss": 0.0101, "step": 18530 }, { "epoch": 1.079979029533407, "grad_norm": 0.30121248960494995, "learning_rate": 9.507168218753781e-05, "loss": 0.0126, "step": 18540 }, { "epoch": 1.0805615424943205, "grad_norm": 0.21919572353363037, "learning_rate": 9.506452156634362e-05, "loss": 0.0079, "step": 18550 }, { "epoch": 1.0811440554552338, "grad_norm": 0.1866351217031479, "learning_rate": 9.505735601696109e-05, "loss": 0.0141, "step": 18560 }, { "epoch": 1.0817265684161472, "grad_norm": 0.3092465102672577, "learning_rate": 9.505018554017385e-05, "loss": 0.0088, "step": 18570 }, { "epoch": 1.0823090813770606, "grad_norm": 0.2343568056821823, "learning_rate": 9.504301013676604e-05, "loss": 0.008, "step": 18580 }, { "epoch": 1.082891594337974, "grad_norm": 0.3370189666748047, "learning_rate": 9.503582980752238e-05, "loss": 0.0077, "step": 18590 }, { "epoch": 1.0834741072988874, "grad_norm": 0.4772375524044037, "learning_rate": 9.502864455322809e-05, "loss": 0.0104, "step": 18600 }, { "epoch": 1.0840566202598008, "grad_norm": 0.3409198224544525, "learning_rate": 9.502145437466891e-05, "loss": 0.0106, "step": 18610 }, { "epoch": 1.0846391332207141, "grad_norm": 0.20869499444961548, "learning_rate": 9.501425927263116e-05, "loss": 0.0084, "step": 18620 }, { "epoch": 1.0852216461816275, "grad_norm": 0.21779008209705353, "learning_rate": 9.500705924790172e-05, "loss": 0.009, "step": 18630 }, { "epoch": 1.085804159142541, "grad_norm": 0.20843614637851715, "learning_rate": 9.499985430126794e-05, "loss": 0.0068, "step": 18640 }, { "epoch": 1.0863866721034543, "grad_norm": 0.24755850434303284, "learning_rate": 9.499264443351775e-05, "loss": 0.0087, "step": 18650 }, { "epoch": 1.0869691850643677, "grad_norm": 0.300339937210083, "learning_rate": 9.498542964543961e-05, "loss": 0.0073, "step": 18660 }, { "epoch": 1.087551698025281, "grad_norm": 0.1738440990447998, "learning_rate": 9.497820993782252e-05, "loss": 0.0156, "step": 18670 }, { "epoch": 1.0881342109861945, "grad_norm": 0.1997164487838745, "learning_rate": 9.497098531145601e-05, "loss": 0.009, "step": 18680 }, { "epoch": 1.0887167239471078, "grad_norm": 0.21066409349441528, "learning_rate": 9.496375576713017e-05, "loss": 0.0085, "step": 18690 }, { "epoch": 1.0892992369080212, "grad_norm": 0.1906173974275589, "learning_rate": 9.49565213056356e-05, "loss": 0.0106, "step": 18700 }, { "epoch": 1.0898817498689346, "grad_norm": 0.20719879865646362, "learning_rate": 9.494928192776342e-05, "loss": 0.009, "step": 18710 }, { "epoch": 1.090464262829848, "grad_norm": 0.21960584819316864, "learning_rate": 9.494203763430538e-05, "loss": 0.0072, "step": 18720 }, { "epoch": 1.0910467757907614, "grad_norm": 0.23618289828300476, "learning_rate": 9.493478842605366e-05, "loss": 0.0071, "step": 18730 }, { "epoch": 1.0916292887516748, "grad_norm": 0.16362279653549194, "learning_rate": 9.492753430380105e-05, "loss": 0.0078, "step": 18740 }, { "epoch": 1.0922118017125881, "grad_norm": 0.2757934629917145, "learning_rate": 9.492027526834083e-05, "loss": 0.0071, "step": 18750 }, { "epoch": 1.0927943146735015, "grad_norm": 0.17537403106689453, "learning_rate": 9.491301132046684e-05, "loss": 0.0071, "step": 18760 }, { "epoch": 1.093376827634415, "grad_norm": 0.1986529380083084, "learning_rate": 9.490574246097345e-05, "loss": 0.0074, "step": 18770 }, { "epoch": 1.0939593405953283, "grad_norm": 0.1990971714258194, "learning_rate": 9.48984686906556e-05, "loss": 0.0082, "step": 18780 }, { "epoch": 1.0945418535562417, "grad_norm": 0.20931269228458405, "learning_rate": 9.489119001030871e-05, "loss": 0.0101, "step": 18790 }, { "epoch": 1.095124366517155, "grad_norm": 0.35964852571487427, "learning_rate": 9.488390642072878e-05, "loss": 0.0085, "step": 18800 }, { "epoch": 1.0957068794780684, "grad_norm": 0.2285773903131485, "learning_rate": 9.48766179227123e-05, "loss": 0.0094, "step": 18810 }, { "epoch": 1.0962893924389818, "grad_norm": 0.24748235940933228, "learning_rate": 9.486932451705636e-05, "loss": 0.0083, "step": 18820 }, { "epoch": 1.0968719053998952, "grad_norm": 0.2821751832962036, "learning_rate": 9.486202620455857e-05, "loss": 0.008, "step": 18830 }, { "epoch": 1.0974544183608086, "grad_norm": 0.25502368807792664, "learning_rate": 9.485472298601704e-05, "loss": 0.0078, "step": 18840 }, { "epoch": 1.098036931321722, "grad_norm": 0.2335904836654663, "learning_rate": 9.484741486223043e-05, "loss": 0.0092, "step": 18850 }, { "epoch": 1.0986194442826354, "grad_norm": 0.31142884492874146, "learning_rate": 9.484010183399797e-05, "loss": 0.0071, "step": 18860 }, { "epoch": 1.0992019572435487, "grad_norm": 0.36439022421836853, "learning_rate": 9.483278390211938e-05, "loss": 0.0085, "step": 18870 }, { "epoch": 1.0997844702044621, "grad_norm": 0.3543577194213867, "learning_rate": 9.482546106739496e-05, "loss": 0.0109, "step": 18880 }, { "epoch": 1.1003669831653755, "grad_norm": 0.2169245034456253, "learning_rate": 9.48181333306255e-05, "loss": 0.01, "step": 18890 }, { "epoch": 1.100949496126289, "grad_norm": 0.3678976893424988, "learning_rate": 9.481080069261237e-05, "loss": 0.0114, "step": 18900 }, { "epoch": 1.1015320090872023, "grad_norm": 0.25162312388420105, "learning_rate": 9.480346315415745e-05, "loss": 0.0066, "step": 18910 }, { "epoch": 1.1021145220481157, "grad_norm": 0.3422697186470032, "learning_rate": 9.479612071606314e-05, "loss": 0.0109, "step": 18920 }, { "epoch": 1.102697035009029, "grad_norm": 0.2976790964603424, "learning_rate": 9.478877337913244e-05, "loss": 0.0107, "step": 18930 }, { "epoch": 1.1032795479699424, "grad_norm": 0.22186706960201263, "learning_rate": 9.478142114416881e-05, "loss": 0.009, "step": 18940 }, { "epoch": 1.1038620609308558, "grad_norm": 0.2713399827480316, "learning_rate": 9.47740640119763e-05, "loss": 0.0076, "step": 18950 }, { "epoch": 1.1044445738917692, "grad_norm": 0.28935763239860535, "learning_rate": 9.476670198335947e-05, "loss": 0.0101, "step": 18960 }, { "epoch": 1.1050270868526826, "grad_norm": 0.21723908185958862, "learning_rate": 9.47593350591234e-05, "loss": 0.0096, "step": 18970 }, { "epoch": 1.105609599813596, "grad_norm": 0.3905322849750519, "learning_rate": 9.475196324007376e-05, "loss": 0.0084, "step": 18980 }, { "epoch": 1.1061921127745094, "grad_norm": 0.2914034426212311, "learning_rate": 9.474458652701669e-05, "loss": 0.0055, "step": 18990 }, { "epoch": 1.1067746257354225, "grad_norm": 0.197305366396904, "learning_rate": 9.473720492075892e-05, "loss": 0.0073, "step": 19000 }, { "epoch": 1.107357138696336, "grad_norm": 0.25743722915649414, "learning_rate": 9.472981842210768e-05, "loss": 0.0083, "step": 19010 }, { "epoch": 1.1079396516572493, "grad_norm": 0.37712979316711426, "learning_rate": 9.472242703187074e-05, "loss": 0.0096, "step": 19020 }, { "epoch": 1.1085221646181627, "grad_norm": 0.2248496413230896, "learning_rate": 9.471503075085643e-05, "loss": 0.0112, "step": 19030 }, { "epoch": 1.109104677579076, "grad_norm": 0.27415990829467773, "learning_rate": 9.470762957987359e-05, "loss": 0.0085, "step": 19040 }, { "epoch": 1.1096871905399894, "grad_norm": 0.2521241307258606, "learning_rate": 9.470022351973158e-05, "loss": 0.0071, "step": 19050 }, { "epoch": 1.1102697035009028, "grad_norm": 0.2753424644470215, "learning_rate": 9.469281257124034e-05, "loss": 0.0072, "step": 19060 }, { "epoch": 1.1108522164618162, "grad_norm": 0.32539743185043335, "learning_rate": 9.46853967352103e-05, "loss": 0.0077, "step": 19070 }, { "epoch": 1.1114347294227296, "grad_norm": 0.2636314928531647, "learning_rate": 9.467797601245246e-05, "loss": 0.0085, "step": 19080 }, { "epoch": 1.112017242383643, "grad_norm": 0.2913241684436798, "learning_rate": 9.467055040377834e-05, "loss": 0.0082, "step": 19090 }, { "epoch": 1.1125997553445564, "grad_norm": 0.2528302073478699, "learning_rate": 9.466311990999999e-05, "loss": 0.0078, "step": 19100 }, { "epoch": 1.1131822683054697, "grad_norm": 0.35537511110305786, "learning_rate": 9.465568453193e-05, "loss": 0.0093, "step": 19110 }, { "epoch": 1.1137647812663831, "grad_norm": 0.20378899574279785, "learning_rate": 9.464824427038148e-05, "loss": 0.0068, "step": 19120 }, { "epoch": 1.1143472942272965, "grad_norm": 0.35600021481513977, "learning_rate": 9.46407991261681e-05, "loss": 0.0088, "step": 19130 }, { "epoch": 1.1149298071882099, "grad_norm": 0.3147162199020386, "learning_rate": 9.463334910010404e-05, "loss": 0.0089, "step": 19140 }, { "epoch": 1.1155123201491233, "grad_norm": 0.21466298401355743, "learning_rate": 9.462589419300403e-05, "loss": 0.009, "step": 19150 }, { "epoch": 1.1160948331100367, "grad_norm": 0.32160311937332153, "learning_rate": 9.461843440568333e-05, "loss": 0.0082, "step": 19160 }, { "epoch": 1.11667734607095, "grad_norm": 0.259195476770401, "learning_rate": 9.461096973895773e-05, "loss": 0.0126, "step": 19170 }, { "epoch": 1.1172598590318634, "grad_norm": 0.2031410187482834, "learning_rate": 9.460350019364355e-05, "loss": 0.008, "step": 19180 }, { "epoch": 1.1178423719927768, "grad_norm": 0.3398054838180542, "learning_rate": 9.459602577055764e-05, "loss": 0.009, "step": 19190 }, { "epoch": 1.1184248849536902, "grad_norm": 0.34676042199134827, "learning_rate": 9.45885464705174e-05, "loss": 0.0099, "step": 19200 }, { "epoch": 1.1190073979146036, "grad_norm": 0.32962459325790405, "learning_rate": 9.458106229434076e-05, "loss": 0.0086, "step": 19210 }, { "epoch": 1.119589910875517, "grad_norm": 0.22255957126617432, "learning_rate": 9.457357324284617e-05, "loss": 0.0076, "step": 19220 }, { "epoch": 1.1201724238364303, "grad_norm": 0.30311116576194763, "learning_rate": 9.456607931685262e-05, "loss": 0.0101, "step": 19230 }, { "epoch": 1.1207549367973437, "grad_norm": 0.1881081759929657, "learning_rate": 9.455858051717965e-05, "loss": 0.0102, "step": 19240 }, { "epoch": 1.121337449758257, "grad_norm": 0.2619400918483734, "learning_rate": 9.45510768446473e-05, "loss": 0.0099, "step": 19250 }, { "epoch": 1.1219199627191705, "grad_norm": 0.34866613149642944, "learning_rate": 9.454356830007618e-05, "loss": 0.0088, "step": 19260 }, { "epoch": 1.1225024756800839, "grad_norm": 0.25261762738227844, "learning_rate": 9.45360548842874e-05, "loss": 0.0078, "step": 19270 }, { "epoch": 1.1230849886409973, "grad_norm": 0.2600233256816864, "learning_rate": 9.452853659810261e-05, "loss": 0.0096, "step": 19280 }, { "epoch": 1.1236675016019106, "grad_norm": 0.27366212010383606, "learning_rate": 9.452101344234401e-05, "loss": 0.0103, "step": 19290 }, { "epoch": 1.124250014562824, "grad_norm": 0.2630864977836609, "learning_rate": 9.451348541783431e-05, "loss": 0.0098, "step": 19300 }, { "epoch": 1.1248325275237374, "grad_norm": 0.2510787844657898, "learning_rate": 9.450595252539678e-05, "loss": 0.007, "step": 19310 }, { "epoch": 1.1254150404846508, "grad_norm": 0.2455877959728241, "learning_rate": 9.449841476585518e-05, "loss": 0.0077, "step": 19320 }, { "epoch": 1.1259975534455642, "grad_norm": 0.2064473032951355, "learning_rate": 9.449087214003384e-05, "loss": 0.0075, "step": 19330 }, { "epoch": 1.1265800664064776, "grad_norm": 0.16705022752285004, "learning_rate": 9.448332464875765e-05, "loss": 0.0084, "step": 19340 }, { "epoch": 1.127162579367391, "grad_norm": 0.21957725286483765, "learning_rate": 9.447577229285192e-05, "loss": 0.0061, "step": 19350 }, { "epoch": 1.1277450923283043, "grad_norm": 0.22502920031547546, "learning_rate": 9.446821507314261e-05, "loss": 0.0071, "step": 19360 }, { "epoch": 1.1283276052892177, "grad_norm": 0.260475218296051, "learning_rate": 9.446065299045617e-05, "loss": 0.0077, "step": 19370 }, { "epoch": 1.128910118250131, "grad_norm": 0.3183257281780243, "learning_rate": 9.445308604561955e-05, "loss": 0.0091, "step": 19380 }, { "epoch": 1.1294926312110445, "grad_norm": 0.36027127504348755, "learning_rate": 9.444551423946028e-05, "loss": 0.0088, "step": 19390 }, { "epoch": 1.1300751441719579, "grad_norm": 0.27057379484176636, "learning_rate": 9.443793757280638e-05, "loss": 0.0084, "step": 19400 }, { "epoch": 1.1306576571328713, "grad_norm": 0.3292173743247986, "learning_rate": 9.443035604648646e-05, "loss": 0.0114, "step": 19410 }, { "epoch": 1.1312401700937846, "grad_norm": 0.23857921361923218, "learning_rate": 9.44227696613296e-05, "loss": 0.0075, "step": 19420 }, { "epoch": 1.131822683054698, "grad_norm": 0.22772014141082764, "learning_rate": 9.441517841816542e-05, "loss": 0.0076, "step": 19430 }, { "epoch": 1.1324051960156114, "grad_norm": 0.3803078532218933, "learning_rate": 9.440758231782413e-05, "loss": 0.0083, "step": 19440 }, { "epoch": 1.1329877089765248, "grad_norm": 0.31477072834968567, "learning_rate": 9.439998136113639e-05, "loss": 0.0081, "step": 19450 }, { "epoch": 1.1335702219374382, "grad_norm": 0.1379203051328659, "learning_rate": 9.439237554893344e-05, "loss": 0.0069, "step": 19460 }, { "epoch": 1.1341527348983516, "grad_norm": 0.37279781699180603, "learning_rate": 9.438476488204705e-05, "loss": 0.0112, "step": 19470 }, { "epoch": 1.134735247859265, "grad_norm": 0.22130829095840454, "learning_rate": 9.43771493613095e-05, "loss": 0.0108, "step": 19480 }, { "epoch": 1.1353177608201783, "grad_norm": 0.27936118841171265, "learning_rate": 9.436952898755362e-05, "loss": 0.0112, "step": 19490 }, { "epoch": 1.1359002737810917, "grad_norm": 0.24827229976654053, "learning_rate": 9.436190376161276e-05, "loss": 0.0069, "step": 19500 }, { "epoch": 1.136482786742005, "grad_norm": 0.30740270018577576, "learning_rate": 9.43542736843208e-05, "loss": 0.009, "step": 19510 }, { "epoch": 1.1370652997029185, "grad_norm": 0.20039187371730804, "learning_rate": 9.434663875651216e-05, "loss": 0.0068, "step": 19520 }, { "epoch": 1.1376478126638319, "grad_norm": 0.33163130283355713, "learning_rate": 9.433899897902177e-05, "loss": 0.0105, "step": 19530 }, { "epoch": 1.1382303256247452, "grad_norm": 0.21095718443393707, "learning_rate": 9.433135435268511e-05, "loss": 0.0078, "step": 19540 }, { "epoch": 1.1388128385856586, "grad_norm": 0.2037818431854248, "learning_rate": 9.432370487833819e-05, "loss": 0.0108, "step": 19550 }, { "epoch": 1.1393953515465718, "grad_norm": 0.21959306299686432, "learning_rate": 9.431605055681756e-05, "loss": 0.0075, "step": 19560 }, { "epoch": 1.1399778645074852, "grad_norm": 0.29715031385421753, "learning_rate": 9.430839138896026e-05, "loss": 0.0101, "step": 19570 }, { "epoch": 1.1405603774683986, "grad_norm": 0.48481085896492004, "learning_rate": 9.43007273756039e-05, "loss": 0.01, "step": 19580 }, { "epoch": 1.141142890429312, "grad_norm": 0.3774980902671814, "learning_rate": 9.429305851758658e-05, "loss": 0.0075, "step": 19590 }, { "epoch": 1.1417254033902253, "grad_norm": 0.2401307374238968, "learning_rate": 9.428538481574699e-05, "loss": 0.0076, "step": 19600 }, { "epoch": 1.1423079163511387, "grad_norm": 0.26697251200675964, "learning_rate": 9.42777062709243e-05, "loss": 0.0058, "step": 19610 }, { "epoch": 1.142890429312052, "grad_norm": 0.3344525992870331, "learning_rate": 9.427002288395821e-05, "loss": 0.008, "step": 19620 }, { "epoch": 1.1434729422729655, "grad_norm": 0.4280354976654053, "learning_rate": 9.426233465568898e-05, "loss": 0.0081, "step": 19630 }, { "epoch": 1.1440554552338789, "grad_norm": 0.22257982194423676, "learning_rate": 9.42546415869574e-05, "loss": 0.007, "step": 19640 }, { "epoch": 1.1446379681947922, "grad_norm": 0.28717586398124695, "learning_rate": 9.424694367860473e-05, "loss": 0.0079, "step": 19650 }, { "epoch": 1.1452204811557056, "grad_norm": 0.38477250933647156, "learning_rate": 9.423924093147284e-05, "loss": 0.0096, "step": 19660 }, { "epoch": 1.145802994116619, "grad_norm": 0.4213363826274872, "learning_rate": 9.423153334640407e-05, "loss": 0.0116, "step": 19670 }, { "epoch": 1.1463855070775324, "grad_norm": 0.33957216143608093, "learning_rate": 9.42238209242413e-05, "loss": 0.0089, "step": 19680 }, { "epoch": 1.1469680200384458, "grad_norm": 0.4019477665424347, "learning_rate": 9.421610366582798e-05, "loss": 0.0082, "step": 19690 }, { "epoch": 1.1475505329993592, "grad_norm": 0.35750430822372437, "learning_rate": 9.420838157200803e-05, "loss": 0.0135, "step": 19700 }, { "epoch": 1.1481330459602725, "grad_norm": 0.4203181862831116, "learning_rate": 9.420065464362594e-05, "loss": 0.0123, "step": 19710 }, { "epoch": 1.148715558921186, "grad_norm": 0.38519760966300964, "learning_rate": 9.419292288152673e-05, "loss": 0.0098, "step": 19720 }, { "epoch": 1.1492980718820993, "grad_norm": 0.3275643587112427, "learning_rate": 9.418518628655588e-05, "loss": 0.0095, "step": 19730 }, { "epoch": 1.1498805848430127, "grad_norm": 0.21732957661151886, "learning_rate": 9.417744485955951e-05, "loss": 0.0101, "step": 19740 }, { "epoch": 1.150463097803926, "grad_norm": 0.26531630754470825, "learning_rate": 9.41696986013842e-05, "loss": 0.0095, "step": 19750 }, { "epoch": 1.1510456107648395, "grad_norm": 0.26332056522369385, "learning_rate": 9.416194751287705e-05, "loss": 0.0088, "step": 19760 }, { "epoch": 1.1516281237257529, "grad_norm": 0.35790109634399414, "learning_rate": 9.415419159488572e-05, "loss": 0.0094, "step": 19770 }, { "epoch": 1.1522106366866662, "grad_norm": 0.266802579164505, "learning_rate": 9.414643084825837e-05, "loss": 0.0106, "step": 19780 }, { "epoch": 1.1527931496475796, "grad_norm": 0.2423689067363739, "learning_rate": 9.413866527384372e-05, "loss": 0.0105, "step": 19790 }, { "epoch": 1.153375662608493, "grad_norm": 0.21430227160453796, "learning_rate": 9.4130894872491e-05, "loss": 0.0074, "step": 19800 }, { "epoch": 1.1539581755694064, "grad_norm": 0.40093275904655457, "learning_rate": 9.412311964504998e-05, "loss": 0.0085, "step": 19810 }, { "epoch": 1.1545406885303198, "grad_norm": 0.24764542281627655, "learning_rate": 9.411533959237091e-05, "loss": 0.0063, "step": 19820 }, { "epoch": 1.1551232014912332, "grad_norm": 0.20048311352729797, "learning_rate": 9.410755471530464e-05, "loss": 0.0111, "step": 19830 }, { "epoch": 1.1557057144521465, "grad_norm": 0.2665015757083893, "learning_rate": 9.40997650147025e-05, "loss": 0.0116, "step": 19840 }, { "epoch": 1.15628822741306, "grad_norm": 0.16436393558979034, "learning_rate": 9.409197049141637e-05, "loss": 0.0107, "step": 19850 }, { "epoch": 1.1568707403739733, "grad_norm": 0.21952608227729797, "learning_rate": 9.408417114629863e-05, "loss": 0.0122, "step": 19860 }, { "epoch": 1.1574532533348867, "grad_norm": 0.2612266540527344, "learning_rate": 9.40763669802022e-05, "loss": 0.0101, "step": 19870 }, { "epoch": 1.1580357662958, "grad_norm": 0.3260873556137085, "learning_rate": 9.406855799398056e-05, "loss": 0.0082, "step": 19880 }, { "epoch": 1.1586182792567135, "grad_norm": 0.26870355010032654, "learning_rate": 9.406074418848767e-05, "loss": 0.0104, "step": 19890 }, { "epoch": 1.1592007922176268, "grad_norm": 0.18225085735321045, "learning_rate": 9.405292556457805e-05, "loss": 0.0082, "step": 19900 }, { "epoch": 1.1597833051785402, "grad_norm": 0.1977808177471161, "learning_rate": 9.404510212310671e-05, "loss": 0.0088, "step": 19910 }, { "epoch": 1.1603658181394536, "grad_norm": 0.23561054468154907, "learning_rate": 9.403727386492924e-05, "loss": 0.0078, "step": 19920 }, { "epoch": 1.160948331100367, "grad_norm": 0.2707197368144989, "learning_rate": 9.40294407909017e-05, "loss": 0.0074, "step": 19930 }, { "epoch": 1.1615308440612804, "grad_norm": 0.2827933728694916, "learning_rate": 9.40216029018807e-05, "loss": 0.0079, "step": 19940 }, { "epoch": 1.1621133570221938, "grad_norm": 0.236893430352211, "learning_rate": 9.401376019872338e-05, "loss": 0.0087, "step": 19950 }, { "epoch": 1.1626958699831071, "grad_norm": 0.3694540560245514, "learning_rate": 9.400591268228746e-05, "loss": 0.0101, "step": 19960 }, { "epoch": 1.1632783829440205, "grad_norm": 0.30204591155052185, "learning_rate": 9.399806035343106e-05, "loss": 0.0084, "step": 19970 }, { "epoch": 1.163860895904934, "grad_norm": 0.310968816280365, "learning_rate": 9.399020321301294e-05, "loss": 0.0086, "step": 19980 }, { "epoch": 1.1644434088658473, "grad_norm": 0.3812788724899292, "learning_rate": 9.398234126189234e-05, "loss": 0.0121, "step": 19990 }, { "epoch": 1.1650259218267607, "grad_norm": 0.3042176365852356, "learning_rate": 9.397447450092902e-05, "loss": 0.0133, "step": 20000 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }