{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4035643100011725, "eval_steps": 500, "global_step": 20500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011724703951225231, "grad_norm": 27.829729080200195, "learning_rate": 9.000000000000001e-07, "loss": 1.045, "step": 10 }, { "epoch": 0.0023449407902450463, "grad_norm": 4.97297477722168, "learning_rate": 1.9000000000000002e-06, "loss": 0.9812, "step": 20 }, { "epoch": 0.0035174111853675696, "grad_norm": 2.8889613151550293, "learning_rate": 2.9e-06, "loss": 0.8604, "step": 30 }, { "epoch": 0.0046898815804900925, "grad_norm": 2.9823195934295654, "learning_rate": 3.900000000000001e-06, "loss": 0.8893, "step": 40 }, { "epoch": 0.005862351975612616, "grad_norm": 1.7034976482391357, "learning_rate": 4.9000000000000005e-06, "loss": 0.8293, "step": 50 }, { "epoch": 0.007034822370735139, "grad_norm": 1.5929549932479858, "learning_rate": 5.9e-06, "loss": 0.9068, "step": 60 }, { "epoch": 0.008207292765857662, "grad_norm": 1.4501399993896484, "learning_rate": 6.9e-06, "loss": 0.8827, "step": 70 }, { "epoch": 0.009379763160980185, "grad_norm": 1.7045843601226807, "learning_rate": 7.9e-06, "loss": 0.8106, "step": 80 }, { "epoch": 0.010552233556102708, "grad_norm": 1.1813807487487793, "learning_rate": 8.900000000000001e-06, "loss": 0.885, "step": 90 }, { "epoch": 0.011724703951225232, "grad_norm": 1.9013347625732422, "learning_rate": 9.9e-06, "loss": 0.8914, "step": 100 }, { "epoch": 0.012897174346347755, "grad_norm": 1.4923547506332397, "learning_rate": 9.999996923284847e-06, "loss": 0.7755, "step": 110 }, { "epoch": 0.014069644741470278, "grad_norm": 1.2729132175445557, "learning_rate": 9.999986287731154e-06, "loss": 0.8561, "step": 120 }, { "epoch": 0.015242115136592802, "grad_norm": 1.4345440864562988, "learning_rate": 9.999968055370938e-06, "loss": 0.8789, "step": 130 }, { "epoch": 0.016414585531715323, "grad_norm": 1.3463603258132935, "learning_rate": 9.999942226231903e-06, "loss": 0.8874, "step": 140 }, { "epoch": 0.017587055926837847, "grad_norm": 1.3227310180664062, "learning_rate": 9.99990880035329e-06, "loss": 0.8644, "step": 150 }, { "epoch": 0.01875952632196037, "grad_norm": 1.1351855993270874, "learning_rate": 9.999867777785885e-06, "loss": 0.8047, "step": 160 }, { "epoch": 0.019931996717082893, "grad_norm": 0.9660323858261108, "learning_rate": 9.999819158592019e-06, "loss": 0.7885, "step": 170 }, { "epoch": 0.021104467112205417, "grad_norm": 1.1822775602340698, "learning_rate": 9.999762942845562e-06, "loss": 0.8852, "step": 180 }, { "epoch": 0.02227693750732794, "grad_norm": 1.0962854623794556, "learning_rate": 9.999699130631924e-06, "loss": 0.809, "step": 190 }, { "epoch": 0.023449407902450464, "grad_norm": 1.2172538042068481, "learning_rate": 9.99962772204806e-06, "loss": 0.8336, "step": 200 }, { "epoch": 0.024621878297572987, "grad_norm": 1.0687609910964966, "learning_rate": 9.999548717202466e-06, "loss": 0.8244, "step": 210 }, { "epoch": 0.02579434869269551, "grad_norm": 1.0752681493759155, "learning_rate": 9.99946211621518e-06, "loss": 0.7654, "step": 220 }, { "epoch": 0.026966819087818034, "grad_norm": 1.4007142782211304, "learning_rate": 9.999367919217777e-06, "loss": 0.8389, "step": 230 }, { "epoch": 0.028139289482940557, "grad_norm": 1.0361547470092773, "learning_rate": 9.999266126353383e-06, "loss": 0.8514, "step": 240 }, { "epoch": 0.02931175987806308, "grad_norm": 0.9558302760124207, "learning_rate": 9.999156737776655e-06, "loss": 0.8362, "step": 250 }, { "epoch": 0.030484230273185604, "grad_norm": 1.0666389465332031, "learning_rate": 9.999039753653793e-06, "loss": 0.813, "step": 260 }, { "epoch": 0.03165670066830813, "grad_norm": 1.1411157846450806, "learning_rate": 9.998915174162538e-06, "loss": 0.844, "step": 270 }, { "epoch": 0.03282917106343065, "grad_norm": 1.051877737045288, "learning_rate": 9.998782999492177e-06, "loss": 0.7511, "step": 280 }, { "epoch": 0.034001641458553174, "grad_norm": 1.1638388633728027, "learning_rate": 9.998643229843526e-06, "loss": 0.785, "step": 290 }, { "epoch": 0.035174111853675694, "grad_norm": 1.1727697849273682, "learning_rate": 9.99849586542895e-06, "loss": 0.8591, "step": 300 }, { "epoch": 0.03634658224879822, "grad_norm": 1.2598038911819458, "learning_rate": 9.998340906472346e-06, "loss": 0.842, "step": 310 }, { "epoch": 0.03751905264392074, "grad_norm": 1.220785140991211, "learning_rate": 9.998178353209156e-06, "loss": 0.8295, "step": 320 }, { "epoch": 0.03869152303904327, "grad_norm": 1.1124545335769653, "learning_rate": 9.998008205886359e-06, "loss": 0.856, "step": 330 }, { "epoch": 0.03986399343416579, "grad_norm": 1.1316536664962769, "learning_rate": 9.997830464762467e-06, "loss": 0.8176, "step": 340 }, { "epoch": 0.041036463829288314, "grad_norm": 1.122061848640442, "learning_rate": 9.997645130107536e-06, "loss": 0.8054, "step": 350 }, { "epoch": 0.042208934224410834, "grad_norm": 1.0631396770477295, "learning_rate": 9.997452202203155e-06, "loss": 0.8491, "step": 360 }, { "epoch": 0.043381404619533354, "grad_norm": 1.117306113243103, "learning_rate": 9.997251681342457e-06, "loss": 0.9264, "step": 370 }, { "epoch": 0.04455387501465588, "grad_norm": 1.1257014274597168, "learning_rate": 9.997043567830102e-06, "loss": 0.7506, "step": 380 }, { "epoch": 0.0457263454097784, "grad_norm": 1.0716959238052368, "learning_rate": 9.99682786198229e-06, "loss": 0.8748, "step": 390 }, { "epoch": 0.04689881580490093, "grad_norm": 1.1161850690841675, "learning_rate": 9.996604564126761e-06, "loss": 0.8361, "step": 400 }, { "epoch": 0.04807128620002345, "grad_norm": 1.117165446281433, "learning_rate": 9.996373674602783e-06, "loss": 0.8402, "step": 410 }, { "epoch": 0.049243756595145974, "grad_norm": 1.0964220762252808, "learning_rate": 9.996135193761163e-06, "loss": 0.7945, "step": 420 }, { "epoch": 0.050416226990268494, "grad_norm": 1.2352114915847778, "learning_rate": 9.995889121964239e-06, "loss": 0.8238, "step": 430 }, { "epoch": 0.05158869738539102, "grad_norm": 1.1692341566085815, "learning_rate": 9.995635459585885e-06, "loss": 0.7949, "step": 440 }, { "epoch": 0.05276116778051354, "grad_norm": 0.8931179046630859, "learning_rate": 9.995374207011506e-06, "loss": 0.8815, "step": 450 }, { "epoch": 0.05393363817563607, "grad_norm": 1.156550645828247, "learning_rate": 9.995105364638042e-06, "loss": 0.8265, "step": 460 }, { "epoch": 0.05510610857075859, "grad_norm": 0.9915204048156738, "learning_rate": 9.994828932873961e-06, "loss": 0.8195, "step": 470 }, { "epoch": 0.056278578965881114, "grad_norm": 1.277901530265808, "learning_rate": 9.994544912139265e-06, "loss": 0.8522, "step": 480 }, { "epoch": 0.057451049361003634, "grad_norm": 1.1370762586593628, "learning_rate": 9.994253302865485e-06, "loss": 0.8178, "step": 490 }, { "epoch": 0.05862351975612616, "grad_norm": 1.1734304428100586, "learning_rate": 9.993954105495682e-06, "loss": 0.8252, "step": 500 }, { "epoch": 0.05862351975612616, "eval_loss": 0.8224806189537048, "eval_model_preparation_time": 0.0, "eval_runtime": 2156.322, "eval_samples_per_second": 3.516, "eval_steps_per_second": 1.758, "step": 500 }, { "epoch": 0.05979599015124868, "grad_norm": 1.054801344871521, "learning_rate": 9.993647320484445e-06, "loss": 0.8333, "step": 510 }, { "epoch": 0.06096846054637121, "grad_norm": 1.0117371082305908, "learning_rate": 9.993332948297897e-06, "loss": 0.8002, "step": 520 }, { "epoch": 0.06214093094149373, "grad_norm": 1.035262107849121, "learning_rate": 9.993010989413676e-06, "loss": 0.7876, "step": 530 }, { "epoch": 0.06331340133661625, "grad_norm": 1.0231842994689941, "learning_rate": 9.992681444320965e-06, "loss": 0.8074, "step": 540 }, { "epoch": 0.06448587173173877, "grad_norm": 0.9932712912559509, "learning_rate": 9.992344313520454e-06, "loss": 0.8446, "step": 550 }, { "epoch": 0.0656583421268613, "grad_norm": 1.143707036972046, "learning_rate": 9.991999597524376e-06, "loss": 0.8755, "step": 560 }, { "epoch": 0.06683081252198382, "grad_norm": 1.1646944284439087, "learning_rate": 9.991647296856475e-06, "loss": 0.8484, "step": 570 }, { "epoch": 0.06800328291710635, "grad_norm": 1.0332087278366089, "learning_rate": 9.991287412052026e-06, "loss": 0.7562, "step": 580 }, { "epoch": 0.06917575331222886, "grad_norm": 1.1251457929611206, "learning_rate": 9.990919943657829e-06, "loss": 0.8004, "step": 590 }, { "epoch": 0.07034822370735139, "grad_norm": 1.0979716777801514, "learning_rate": 9.990544892232196e-06, "loss": 0.8212, "step": 600 }, { "epoch": 0.07152069410247391, "grad_norm": 1.1119449138641357, "learning_rate": 9.990162258344973e-06, "loss": 0.8041, "step": 610 }, { "epoch": 0.07269316449759644, "grad_norm": 1.0454171895980835, "learning_rate": 9.98977204257752e-06, "loss": 0.7849, "step": 620 }, { "epoch": 0.07386563489271895, "grad_norm": 1.1640774011611938, "learning_rate": 9.989374245522713e-06, "loss": 0.9081, "step": 630 }, { "epoch": 0.07503810528784148, "grad_norm": 1.1179234981536865, "learning_rate": 9.988968867784958e-06, "loss": 0.8351, "step": 640 }, { "epoch": 0.07621057568296401, "grad_norm": 0.9342063069343567, "learning_rate": 9.988555909980165e-06, "loss": 0.7969, "step": 650 }, { "epoch": 0.07738304607808653, "grad_norm": 1.0664631128311157, "learning_rate": 9.988135372735772e-06, "loss": 0.8298, "step": 660 }, { "epoch": 0.07855551647320905, "grad_norm": 1.1320250034332275, "learning_rate": 9.98770725669073e-06, "loss": 0.8193, "step": 670 }, { "epoch": 0.07972798686833157, "grad_norm": 1.0470677614212036, "learning_rate": 9.987271562495497e-06, "loss": 0.8478, "step": 680 }, { "epoch": 0.0809004572634541, "grad_norm": 1.3347469568252563, "learning_rate": 9.98682829081206e-06, "loss": 0.746, "step": 690 }, { "epoch": 0.08207292765857663, "grad_norm": 1.1938596963882446, "learning_rate": 9.986377442313905e-06, "loss": 0.8543, "step": 700 }, { "epoch": 0.08324539805369914, "grad_norm": 1.0010122060775757, "learning_rate": 9.985919017686038e-06, "loss": 0.8074, "step": 710 }, { "epoch": 0.08441786844882167, "grad_norm": 0.9762783646583557, "learning_rate": 9.985453017624973e-06, "loss": 0.832, "step": 720 }, { "epoch": 0.0855903388439442, "grad_norm": 1.0442547798156738, "learning_rate": 9.984979442838734e-06, "loss": 0.8131, "step": 730 }, { "epoch": 0.08676280923906671, "grad_norm": 1.013124942779541, "learning_rate": 9.984498294046856e-06, "loss": 0.836, "step": 740 }, { "epoch": 0.08793527963418923, "grad_norm": 2.9621942043304443, "learning_rate": 9.984009571980378e-06, "loss": 0.8201, "step": 750 }, { "epoch": 0.08910775002931176, "grad_norm": 1.3046759366989136, "learning_rate": 9.983513277381846e-06, "loss": 0.7715, "step": 760 }, { "epoch": 0.09028022042443429, "grad_norm": 0.9856771230697632, "learning_rate": 9.983009411005316e-06, "loss": 0.797, "step": 770 }, { "epoch": 0.0914526908195568, "grad_norm": 1.1945509910583496, "learning_rate": 9.982497973616346e-06, "loss": 0.8293, "step": 780 }, { "epoch": 0.09262516121467933, "grad_norm": 1.1303714513778687, "learning_rate": 9.981978965991993e-06, "loss": 0.754, "step": 790 }, { "epoch": 0.09379763160980185, "grad_norm": 0.9309568405151367, "learning_rate": 9.981452388920819e-06, "loss": 0.8208, "step": 800 }, { "epoch": 0.09497010200492438, "grad_norm": 1.0520724058151245, "learning_rate": 9.98091824320289e-06, "loss": 0.8431, "step": 810 }, { "epoch": 0.0961425724000469, "grad_norm": 1.0934245586395264, "learning_rate": 9.980376529649768e-06, "loss": 0.8317, "step": 820 }, { "epoch": 0.09731504279516942, "grad_norm": 0.9990631937980652, "learning_rate": 9.979827249084512e-06, "loss": 0.8227, "step": 830 }, { "epoch": 0.09848751319029195, "grad_norm": 1.1128180027008057, "learning_rate": 9.97927040234168e-06, "loss": 0.8068, "step": 840 }, { "epoch": 0.09965998358541447, "grad_norm": 0.9136407971382141, "learning_rate": 9.978705990267328e-06, "loss": 0.804, "step": 850 }, { "epoch": 0.10083245398053699, "grad_norm": 1.0382263660430908, "learning_rate": 9.978134013719003e-06, "loss": 0.8302, "step": 860 }, { "epoch": 0.10200492437565951, "grad_norm": 0.9846731424331665, "learning_rate": 9.977554473565745e-06, "loss": 0.7653, "step": 870 }, { "epoch": 0.10317739477078204, "grad_norm": 1.0712236166000366, "learning_rate": 9.97696737068809e-06, "loss": 0.8412, "step": 880 }, { "epoch": 0.10434986516590457, "grad_norm": 1.0287938117980957, "learning_rate": 9.97637270597806e-06, "loss": 0.809, "step": 890 }, { "epoch": 0.10552233556102708, "grad_norm": 1.1063660383224487, "learning_rate": 9.975770480339169e-06, "loss": 0.8011, "step": 900 }, { "epoch": 0.10669480595614961, "grad_norm": 0.9746326208114624, "learning_rate": 9.975160694686416e-06, "loss": 0.7469, "step": 910 }, { "epoch": 0.10786727635127213, "grad_norm": 1.1849004030227661, "learning_rate": 9.97454334994629e-06, "loss": 0.8738, "step": 920 }, { "epoch": 0.10903974674639465, "grad_norm": 1.126932978630066, "learning_rate": 9.973918447056763e-06, "loss": 0.7726, "step": 930 }, { "epoch": 0.11021221714151717, "grad_norm": 1.0914807319641113, "learning_rate": 9.973285986967292e-06, "loss": 0.8243, "step": 940 }, { "epoch": 0.1113846875366397, "grad_norm": 1.1280622482299805, "learning_rate": 9.972645970638812e-06, "loss": 0.8366, "step": 950 }, { "epoch": 0.11255715793176223, "grad_norm": 0.9570925831794739, "learning_rate": 9.971998399043744e-06, "loss": 0.7974, "step": 960 }, { "epoch": 0.11372962832688474, "grad_norm": 1.1555655002593994, "learning_rate": 9.971343273165986e-06, "loss": 0.7964, "step": 970 }, { "epoch": 0.11490209872200727, "grad_norm": 1.0856919288635254, "learning_rate": 9.970680594000912e-06, "loss": 0.8162, "step": 980 }, { "epoch": 0.1160745691171298, "grad_norm": 1.0501084327697754, "learning_rate": 9.970010362555375e-06, "loss": 0.7632, "step": 990 }, { "epoch": 0.11724703951225232, "grad_norm": 0.9654927849769592, "learning_rate": 9.969332579847702e-06, "loss": 0.7794, "step": 1000 }, { "epoch": 0.11724703951225232, "eval_loss": 0.8109387755393982, "eval_model_preparation_time": 0.0, "eval_runtime": 2151.4279, "eval_samples_per_second": 3.524, "eval_steps_per_second": 1.762, "step": 1000 }, { "epoch": 0.11841950990737483, "grad_norm": 1.060977816581726, "learning_rate": 9.968647246907692e-06, "loss": 0.8687, "step": 1010 }, { "epoch": 0.11959198030249736, "grad_norm": 1.0419484376907349, "learning_rate": 9.967954364776615e-06, "loss": 0.8189, "step": 1020 }, { "epoch": 0.12076445069761989, "grad_norm": 1.2356235980987549, "learning_rate": 9.967253934507215e-06, "loss": 0.7552, "step": 1030 }, { "epoch": 0.12193692109274241, "grad_norm": 1.1029407978057861, "learning_rate": 9.966545957163698e-06, "loss": 0.8423, "step": 1040 }, { "epoch": 0.12310939148786493, "grad_norm": 1.0124760866165161, "learning_rate": 9.965830433821743e-06, "loss": 0.8365, "step": 1050 }, { "epoch": 0.12428186188298745, "grad_norm": 2.2652392387390137, "learning_rate": 9.965107365568491e-06, "loss": 0.8382, "step": 1060 }, { "epoch": 0.12545433227810998, "grad_norm": 0.9552695751190186, "learning_rate": 9.964376753502548e-06, "loss": 0.8016, "step": 1070 }, { "epoch": 0.1266268026732325, "grad_norm": 1.0210859775543213, "learning_rate": 9.96363859873398e-06, "loss": 0.8054, "step": 1080 }, { "epoch": 0.12779927306835503, "grad_norm": 1.0651522874832153, "learning_rate": 9.962892902384311e-06, "loss": 0.8199, "step": 1090 }, { "epoch": 0.12897174346347753, "grad_norm": 1.0713568925857544, "learning_rate": 9.96213966558653e-06, "loss": 0.8059, "step": 1100 }, { "epoch": 0.13014421385860006, "grad_norm": 1.183523178100586, "learning_rate": 9.961378889485075e-06, "loss": 0.8206, "step": 1110 }, { "epoch": 0.1313166842537226, "grad_norm": 1.1169764995574951, "learning_rate": 9.960610575235848e-06, "loss": 0.8043, "step": 1120 }, { "epoch": 0.13248915464884511, "grad_norm": 1.0489065647125244, "learning_rate": 9.959834724006196e-06, "loss": 0.7755, "step": 1130 }, { "epoch": 0.13366162504396764, "grad_norm": 1.0494054555892944, "learning_rate": 9.959051336974923e-06, "loss": 0.8717, "step": 1140 }, { "epoch": 0.13483409543909017, "grad_norm": 1.0527082681655884, "learning_rate": 9.958260415332277e-06, "loss": 0.8783, "step": 1150 }, { "epoch": 0.1360065658342127, "grad_norm": 1.0578163862228394, "learning_rate": 9.957461960279957e-06, "loss": 0.806, "step": 1160 }, { "epoch": 0.13717903622933522, "grad_norm": 1.0222848653793335, "learning_rate": 9.95665597303111e-06, "loss": 0.7796, "step": 1170 }, { "epoch": 0.13835150662445772, "grad_norm": 1.062174677848816, "learning_rate": 9.955842454810326e-06, "loss": 0.8416, "step": 1180 }, { "epoch": 0.13952397701958025, "grad_norm": 1.0894064903259277, "learning_rate": 9.955021406853636e-06, "loss": 0.84, "step": 1190 }, { "epoch": 0.14069644741470277, "grad_norm": 1.042649269104004, "learning_rate": 9.95419283040851e-06, "loss": 0.8277, "step": 1200 }, { "epoch": 0.1418689178098253, "grad_norm": 1.0076345205307007, "learning_rate": 9.953356726733859e-06, "loss": 0.8849, "step": 1210 }, { "epoch": 0.14304138820494783, "grad_norm": 1.1147191524505615, "learning_rate": 9.952513097100032e-06, "loss": 0.9182, "step": 1220 }, { "epoch": 0.14421385860007035, "grad_norm": 0.9833148717880249, "learning_rate": 9.951661942788807e-06, "loss": 0.8413, "step": 1230 }, { "epoch": 0.14538632899519288, "grad_norm": 1.0901368856430054, "learning_rate": 9.950803265093402e-06, "loss": 0.8162, "step": 1240 }, { "epoch": 0.14655879939031538, "grad_norm": 1.131896734237671, "learning_rate": 9.949937065318462e-06, "loss": 0.8139, "step": 1250 }, { "epoch": 0.1477312697854379, "grad_norm": 1.016331672668457, "learning_rate": 9.949063344780058e-06, "loss": 0.8469, "step": 1260 }, { "epoch": 0.14890374018056043, "grad_norm": 0.9437591433525085, "learning_rate": 9.948182104805695e-06, "loss": 0.7569, "step": 1270 }, { "epoch": 0.15007621057568296, "grad_norm": 0.9311314821243286, "learning_rate": 9.947293346734297e-06, "loss": 0.7775, "step": 1280 }, { "epoch": 0.1512486809708055, "grad_norm": 0.9967880845069885, "learning_rate": 9.94639707191621e-06, "loss": 0.8114, "step": 1290 }, { "epoch": 0.15242115136592801, "grad_norm": 1.004489779472351, "learning_rate": 9.945493281713205e-06, "loss": 0.7563, "step": 1300 }, { "epoch": 0.15359362176105054, "grad_norm": 1.085823655128479, "learning_rate": 9.94458197749847e-06, "loss": 0.7679, "step": 1310 }, { "epoch": 0.15476609215617307, "grad_norm": 1.081066608428955, "learning_rate": 9.943663160656608e-06, "loss": 0.7789, "step": 1320 }, { "epoch": 0.15593856255129557, "grad_norm": 1.0339267253875732, "learning_rate": 9.94273683258364e-06, "loss": 0.8075, "step": 1330 }, { "epoch": 0.1571110329464181, "grad_norm": 1.600469708442688, "learning_rate": 9.941802994686993e-06, "loss": 0.8597, "step": 1340 }, { "epoch": 0.15828350334154062, "grad_norm": 1.195694088935852, "learning_rate": 9.940861648385512e-06, "loss": 0.7882, "step": 1350 }, { "epoch": 0.15945597373666315, "grad_norm": 1.0695439577102661, "learning_rate": 9.939912795109442e-06, "loss": 0.8067, "step": 1360 }, { "epoch": 0.16062844413178567, "grad_norm": 1.2086544036865234, "learning_rate": 9.938956436300443e-06, "loss": 0.7977, "step": 1370 }, { "epoch": 0.1618009145269082, "grad_norm": 1.004260778427124, "learning_rate": 9.937992573411568e-06, "loss": 0.7535, "step": 1380 }, { "epoch": 0.16297338492203073, "grad_norm": 0.9720966219902039, "learning_rate": 9.93702120790728e-06, "loss": 0.7976, "step": 1390 }, { "epoch": 0.16414585531715326, "grad_norm": 0.9096723794937134, "learning_rate": 9.93604234126344e-06, "loss": 0.8453, "step": 1400 }, { "epoch": 0.16531832571227575, "grad_norm": 1.1907901763916016, "learning_rate": 9.935055974967299e-06, "loss": 0.7862, "step": 1410 }, { "epoch": 0.16649079610739828, "grad_norm": 0.9642701745033264, "learning_rate": 9.934062110517513e-06, "loss": 0.7747, "step": 1420 }, { "epoch": 0.1676632665025208, "grad_norm": 1.0275979042053223, "learning_rate": 9.933060749424123e-06, "loss": 0.8533, "step": 1430 }, { "epoch": 0.16883573689764333, "grad_norm": 1.2439024448394775, "learning_rate": 9.932051893208562e-06, "loss": 0.7965, "step": 1440 }, { "epoch": 0.17000820729276586, "grad_norm": 1.05134916305542, "learning_rate": 9.931035543403653e-06, "loss": 0.8159, "step": 1450 }, { "epoch": 0.1711806776878884, "grad_norm": 1.1327779293060303, "learning_rate": 9.930011701553602e-06, "loss": 0.7912, "step": 1460 }, { "epoch": 0.17235314808301092, "grad_norm": 1.1282494068145752, "learning_rate": 9.928980369213997e-06, "loss": 0.8075, "step": 1470 }, { "epoch": 0.17352561847813341, "grad_norm": 1.5871269702911377, "learning_rate": 9.927941547951811e-06, "loss": 0.8047, "step": 1480 }, { "epoch": 0.17469808887325594, "grad_norm": 1.4445865154266357, "learning_rate": 9.926895239345395e-06, "loss": 0.7924, "step": 1490 }, { "epoch": 0.17587055926837847, "grad_norm": 1.0465383529663086, "learning_rate": 9.925841444984469e-06, "loss": 0.8285, "step": 1500 }, { "epoch": 0.17587055926837847, "eval_loss": 0.801771879196167, "eval_model_preparation_time": 0.0, "eval_runtime": 2150.7408, "eval_samples_per_second": 3.525, "eval_steps_per_second": 1.763, "step": 1500 }, { "epoch": 0.177043029663501, "grad_norm": 1.0891391038894653, "learning_rate": 9.924780166470135e-06, "loss": 0.7771, "step": 1510 }, { "epoch": 0.17821550005862352, "grad_norm": 1.0927152633666992, "learning_rate": 9.923711405414866e-06, "loss": 0.8453, "step": 1520 }, { "epoch": 0.17938797045374605, "grad_norm": 1.0403251647949219, "learning_rate": 9.922635163442493e-06, "loss": 0.782, "step": 1530 }, { "epoch": 0.18056044084886858, "grad_norm": 1.0661215782165527, "learning_rate": 9.921551442188228e-06, "loss": 0.8022, "step": 1540 }, { "epoch": 0.1817329112439911, "grad_norm": 1.01125967502594, "learning_rate": 9.920460243298635e-06, "loss": 0.7982, "step": 1550 }, { "epoch": 0.1829053816391136, "grad_norm": 1.0791785717010498, "learning_rate": 9.919361568431646e-06, "loss": 0.8328, "step": 1560 }, { "epoch": 0.18407785203423613, "grad_norm": 1.1360079050064087, "learning_rate": 9.91825541925655e-06, "loss": 0.8075, "step": 1570 }, { "epoch": 0.18525032242935865, "grad_norm": 0.957137405872345, "learning_rate": 9.91714179745399e-06, "loss": 0.8147, "step": 1580 }, { "epoch": 0.18642279282448118, "grad_norm": 1.0720385313034058, "learning_rate": 9.916020704715967e-06, "loss": 0.788, "step": 1590 }, { "epoch": 0.1875952632196037, "grad_norm": 1.0667275190353394, "learning_rate": 9.91489214274583e-06, "loss": 0.8066, "step": 1600 }, { "epoch": 0.18876773361472624, "grad_norm": 1.0418856143951416, "learning_rate": 9.913756113258274e-06, "loss": 0.8297, "step": 1610 }, { "epoch": 0.18994020400984876, "grad_norm": 1.0087631940841675, "learning_rate": 9.912612617979346e-06, "loss": 0.8347, "step": 1620 }, { "epoch": 0.19111267440497126, "grad_norm": 0.9345664381980896, "learning_rate": 9.911461658646435e-06, "loss": 0.7748, "step": 1630 }, { "epoch": 0.1922851448000938, "grad_norm": 1.037280797958374, "learning_rate": 9.910303237008262e-06, "loss": 0.7909, "step": 1640 }, { "epoch": 0.19345761519521631, "grad_norm": 1.066650390625, "learning_rate": 9.909137354824901e-06, "loss": 0.8238, "step": 1650 }, { "epoch": 0.19463008559033884, "grad_norm": 1.0236560106277466, "learning_rate": 9.90796401386775e-06, "loss": 0.7585, "step": 1660 }, { "epoch": 0.19580255598546137, "grad_norm": 0.9707803726196289, "learning_rate": 9.90678321591954e-06, "loss": 0.8236, "step": 1670 }, { "epoch": 0.1969750263805839, "grad_norm": 1.2035491466522217, "learning_rate": 9.905594962774337e-06, "loss": 0.7947, "step": 1680 }, { "epoch": 0.19814749677570642, "grad_norm": 1.0464321374893188, "learning_rate": 9.904399256237531e-06, "loss": 0.7998, "step": 1690 }, { "epoch": 0.19931996717082895, "grad_norm": 0.8360079526901245, "learning_rate": 9.90319609812584e-06, "loss": 0.8361, "step": 1700 }, { "epoch": 0.20049243756595145, "grad_norm": 1.0119160413742065, "learning_rate": 9.901985490267298e-06, "loss": 0.7462, "step": 1710 }, { "epoch": 0.20166490796107397, "grad_norm": 0.9922940731048584, "learning_rate": 9.900767434501261e-06, "loss": 0.7766, "step": 1720 }, { "epoch": 0.2028373783561965, "grad_norm": 1.0462210178375244, "learning_rate": 9.8995419326784e-06, "loss": 0.7869, "step": 1730 }, { "epoch": 0.20400984875131903, "grad_norm": 1.0026392936706543, "learning_rate": 9.898308986660704e-06, "loss": 0.7834, "step": 1740 }, { "epoch": 0.20518231914644156, "grad_norm": 1.0425629615783691, "learning_rate": 9.897068598321465e-06, "loss": 0.7754, "step": 1750 }, { "epoch": 0.20635478954156408, "grad_norm": 1.0533638000488281, "learning_rate": 9.895820769545288e-06, "loss": 0.7919, "step": 1760 }, { "epoch": 0.2075272599366866, "grad_norm": 1.0982707738876343, "learning_rate": 9.89456550222808e-06, "loss": 0.8096, "step": 1770 }, { "epoch": 0.20869973033180914, "grad_norm": 1.0655314922332764, "learning_rate": 9.893302798277051e-06, "loss": 0.8031, "step": 1780 }, { "epoch": 0.20987220072693163, "grad_norm": 0.9073206186294556, "learning_rate": 9.89203265961071e-06, "loss": 0.7727, "step": 1790 }, { "epoch": 0.21104467112205416, "grad_norm": 0.9635109305381775, "learning_rate": 9.890755088158861e-06, "loss": 0.7944, "step": 1800 }, { "epoch": 0.2122171415171767, "grad_norm": 1.0303761959075928, "learning_rate": 9.889470085862604e-06, "loss": 0.8049, "step": 1810 }, { "epoch": 0.21338961191229922, "grad_norm": 1.2308783531188965, "learning_rate": 9.888177654674325e-06, "loss": 0.7956, "step": 1820 }, { "epoch": 0.21456208230742174, "grad_norm": 1.0561060905456543, "learning_rate": 9.8868777965577e-06, "loss": 0.7657, "step": 1830 }, { "epoch": 0.21573455270254427, "grad_norm": 1.0234789848327637, "learning_rate": 9.88557051348769e-06, "loss": 0.7877, "step": 1840 }, { "epoch": 0.2169070230976668, "grad_norm": 0.9951615929603577, "learning_rate": 9.884255807450533e-06, "loss": 0.854, "step": 1850 }, { "epoch": 0.2180794934927893, "grad_norm": 1.141144871711731, "learning_rate": 9.882933680443749e-06, "loss": 0.7979, "step": 1860 }, { "epoch": 0.21925196388791182, "grad_norm": 1.0474966764450073, "learning_rate": 9.881604134476135e-06, "loss": 0.7597, "step": 1870 }, { "epoch": 0.22042443428303435, "grad_norm": 0.9398805499076843, "learning_rate": 9.880267171567752e-06, "loss": 0.7084, "step": 1880 }, { "epoch": 0.22159690467815688, "grad_norm": 1.3078765869140625, "learning_rate": 9.87892279374994e-06, "loss": 0.8453, "step": 1890 }, { "epoch": 0.2227693750732794, "grad_norm": 1.030839204788208, "learning_rate": 9.877571003065298e-06, "loss": 0.7597, "step": 1900 }, { "epoch": 0.22394184546840193, "grad_norm": 1.2291853427886963, "learning_rate": 9.87621180156769e-06, "loss": 0.8125, "step": 1910 }, { "epoch": 0.22511431586352446, "grad_norm": 1.016487956047058, "learning_rate": 9.87484519132224e-06, "loss": 0.7264, "step": 1920 }, { "epoch": 0.22628678625864698, "grad_norm": 0.9034894108772278, "learning_rate": 9.873471174405328e-06, "loss": 0.7936, "step": 1930 }, { "epoch": 0.22745925665376948, "grad_norm": 1.1107020378112793, "learning_rate": 9.87208975290459e-06, "loss": 0.8116, "step": 1940 }, { "epoch": 0.228631727048892, "grad_norm": 0.9209761023521423, "learning_rate": 9.870700928918907e-06, "loss": 0.8806, "step": 1950 }, { "epoch": 0.22980419744401454, "grad_norm": 1.011099100112915, "learning_rate": 9.869304704558412e-06, "loss": 0.8008, "step": 1960 }, { "epoch": 0.23097666783913706, "grad_norm": 1.164881944656372, "learning_rate": 9.867901081944482e-06, "loss": 0.8239, "step": 1970 }, { "epoch": 0.2321491382342596, "grad_norm": 0.9587607979774475, "learning_rate": 9.866490063209727e-06, "loss": 0.7821, "step": 1980 }, { "epoch": 0.23332160862938212, "grad_norm": 1.0871880054473877, "learning_rate": 9.865071650498007e-06, "loss": 0.8479, "step": 1990 }, { "epoch": 0.23449407902450464, "grad_norm": 1.1691548824310303, "learning_rate": 9.863645845964405e-06, "loss": 0.7421, "step": 2000 }, { "epoch": 0.23449407902450464, "eval_loss": 0.7943703532218933, "eval_model_preparation_time": 0.0, "eval_runtime": 2144.3804, "eval_samples_per_second": 3.536, "eval_steps_per_second": 1.768, "step": 2000 }, { "epoch": 0.23566654941962714, "grad_norm": 1.0589951276779175, "learning_rate": 9.86221265177524e-06, "loss": 0.8206, "step": 2010 }, { "epoch": 0.23683901981474967, "grad_norm": 1.1032077074050903, "learning_rate": 9.860772070108057e-06, "loss": 0.8041, "step": 2020 }, { "epoch": 0.2380114902098722, "grad_norm": 1.021894097328186, "learning_rate": 9.85932410315163e-06, "loss": 0.7483, "step": 2030 }, { "epoch": 0.23918396060499472, "grad_norm": 1.2132017612457275, "learning_rate": 9.857868753105947e-06, "loss": 0.7942, "step": 2040 }, { "epoch": 0.24035643100011725, "grad_norm": 1.1714890003204346, "learning_rate": 9.856406022182211e-06, "loss": 0.8088, "step": 2050 }, { "epoch": 0.24152890139523978, "grad_norm": 1.061344027519226, "learning_rate": 9.854935912602857e-06, "loss": 0.8131, "step": 2060 }, { "epoch": 0.2427013717903623, "grad_norm": 1.133420705795288, "learning_rate": 9.85345842660151e-06, "loss": 0.7598, "step": 2070 }, { "epoch": 0.24387384218548483, "grad_norm": 1.3371222019195557, "learning_rate": 9.851973566423012e-06, "loss": 0.8351, "step": 2080 }, { "epoch": 0.24504631258060733, "grad_norm": 1.0554996728897095, "learning_rate": 9.850481334323411e-06, "loss": 0.7597, "step": 2090 }, { "epoch": 0.24621878297572986, "grad_norm": 0.8870165348052979, "learning_rate": 9.848981732569953e-06, "loss": 0.8059, "step": 2100 }, { "epoch": 0.24739125337085238, "grad_norm": 1.1154791116714478, "learning_rate": 9.847474763441079e-06, "loss": 0.8086, "step": 2110 }, { "epoch": 0.2485637237659749, "grad_norm": 1.0563223361968994, "learning_rate": 9.845960429226424e-06, "loss": 0.883, "step": 2120 }, { "epoch": 0.24973619416109744, "grad_norm": 1.1153672933578491, "learning_rate": 9.84443873222682e-06, "loss": 0.7105, "step": 2130 }, { "epoch": 0.25090866455621996, "grad_norm": 1.0710105895996094, "learning_rate": 9.842909674754279e-06, "loss": 0.8048, "step": 2140 }, { "epoch": 0.2520811349513425, "grad_norm": 1.041028380393982, "learning_rate": 9.841373259131998e-06, "loss": 0.8132, "step": 2150 }, { "epoch": 0.253253605346465, "grad_norm": 0.9315906167030334, "learning_rate": 9.839829487694352e-06, "loss": 0.7927, "step": 2160 }, { "epoch": 0.25442607574158754, "grad_norm": 1.0719690322875977, "learning_rate": 9.838278362786897e-06, "loss": 0.7859, "step": 2170 }, { "epoch": 0.25559854613671007, "grad_norm": 1.1046335697174072, "learning_rate": 9.836719886766357e-06, "loss": 0.8186, "step": 2180 }, { "epoch": 0.2567710165318326, "grad_norm": 1.0085625648498535, "learning_rate": 9.835154062000628e-06, "loss": 0.7994, "step": 2190 }, { "epoch": 0.25794348692695507, "grad_norm": 1.1517627239227295, "learning_rate": 9.833580890868768e-06, "loss": 0.8214, "step": 2200 }, { "epoch": 0.2591159573220776, "grad_norm": 1.130420446395874, "learning_rate": 9.832000375761e-06, "loss": 0.8316, "step": 2210 }, { "epoch": 0.2602884277172001, "grad_norm": 1.016338586807251, "learning_rate": 9.830412519078703e-06, "loss": 0.8, "step": 2220 }, { "epoch": 0.26146089811232265, "grad_norm": 0.9534006118774414, "learning_rate": 9.828817323234415e-06, "loss": 0.7203, "step": 2230 }, { "epoch": 0.2626333685074452, "grad_norm": 0.9415191411972046, "learning_rate": 9.827214790651815e-06, "loss": 0.756, "step": 2240 }, { "epoch": 0.2638058389025677, "grad_norm": 0.9768005609512329, "learning_rate": 9.825604923765744e-06, "loss": 0.7622, "step": 2250 }, { "epoch": 0.26497830929769023, "grad_norm": 0.9279842376708984, "learning_rate": 9.823987725022172e-06, "loss": 0.7854, "step": 2260 }, { "epoch": 0.26615077969281276, "grad_norm": 1.064520001411438, "learning_rate": 9.822363196878217e-06, "loss": 0.8272, "step": 2270 }, { "epoch": 0.2673232500879353, "grad_norm": 0.9873538017272949, "learning_rate": 9.820731341802132e-06, "loss": 0.7708, "step": 2280 }, { "epoch": 0.2684957204830578, "grad_norm": 1.1329238414764404, "learning_rate": 9.819092162273298e-06, "loss": 0.7895, "step": 2290 }, { "epoch": 0.26966819087818034, "grad_norm": 1.1268776655197144, "learning_rate": 9.817445660782232e-06, "loss": 0.7699, "step": 2300 }, { "epoch": 0.27084066127330286, "grad_norm": 0.9943812489509583, "learning_rate": 9.815791839830567e-06, "loss": 0.7652, "step": 2310 }, { "epoch": 0.2720131316684254, "grad_norm": 1.0386370420455933, "learning_rate": 9.814130701931065e-06, "loss": 0.7895, "step": 2320 }, { "epoch": 0.2731856020635479, "grad_norm": 0.9500758051872253, "learning_rate": 9.812462249607602e-06, "loss": 0.7438, "step": 2330 }, { "epoch": 0.27435807245867044, "grad_norm": 1.0351080894470215, "learning_rate": 9.810786485395162e-06, "loss": 0.7454, "step": 2340 }, { "epoch": 0.2755305428537929, "grad_norm": 1.0048526525497437, "learning_rate": 9.809103411839849e-06, "loss": 0.7593, "step": 2350 }, { "epoch": 0.27670301324891544, "grad_norm": 1.0440980195999146, "learning_rate": 9.807413031498864e-06, "loss": 0.8025, "step": 2360 }, { "epoch": 0.27787548364403797, "grad_norm": 1.1804531812667847, "learning_rate": 9.805715346940512e-06, "loss": 0.8112, "step": 2370 }, { "epoch": 0.2790479540391605, "grad_norm": 0.9199194312095642, "learning_rate": 9.804010360744199e-06, "loss": 0.7549, "step": 2380 }, { "epoch": 0.280220424434283, "grad_norm": 1.0155823230743408, "learning_rate": 9.802298075500418e-06, "loss": 0.7788, "step": 2390 }, { "epoch": 0.28139289482940555, "grad_norm": 1.1582207679748535, "learning_rate": 9.80057849381076e-06, "loss": 0.7545, "step": 2400 }, { "epoch": 0.2825653652245281, "grad_norm": 0.9761970639228821, "learning_rate": 9.798851618287896e-06, "loss": 0.7047, "step": 2410 }, { "epoch": 0.2837378356196506, "grad_norm": 0.9580684304237366, "learning_rate": 9.797117451555581e-06, "loss": 0.79, "step": 2420 }, { "epoch": 0.28491030601477313, "grad_norm": 1.0065553188323975, "learning_rate": 9.795375996248649e-06, "loss": 0.7529, "step": 2430 }, { "epoch": 0.28608277640989566, "grad_norm": 0.9667154550552368, "learning_rate": 9.793627255013007e-06, "loss": 0.782, "step": 2440 }, { "epoch": 0.2872552468050182, "grad_norm": 1.0504951477050781, "learning_rate": 9.791871230505631e-06, "loss": 0.741, "step": 2450 }, { "epoch": 0.2884277172001407, "grad_norm": 0.9188810586929321, "learning_rate": 9.790107925394564e-06, "loss": 0.7756, "step": 2460 }, { "epoch": 0.28960018759526324, "grad_norm": 1.031525731086731, "learning_rate": 9.788337342358915e-06, "loss": 0.8092, "step": 2470 }, { "epoch": 0.29077265799038576, "grad_norm": 1.1349053382873535, "learning_rate": 9.78655948408884e-06, "loss": 0.7688, "step": 2480 }, { "epoch": 0.2919451283855083, "grad_norm": 1.0804941654205322, "learning_rate": 9.78477435328556e-06, "loss": 0.7252, "step": 2490 }, { "epoch": 0.29311759878063076, "grad_norm": 0.9832829236984253, "learning_rate": 9.78298195266134e-06, "loss": 0.8108, "step": 2500 }, { "epoch": 0.29311759878063076, "eval_loss": 0.7872751951217651, "eval_model_preparation_time": 0.0, "eval_runtime": 2142.7494, "eval_samples_per_second": 3.538, "eval_steps_per_second": 1.769, "step": 2500 }, { "epoch": 0.2942900691757533, "grad_norm": 1.0062087774276733, "learning_rate": 9.781182284939493e-06, "loss": 0.8433, "step": 2510 }, { "epoch": 0.2954625395708758, "grad_norm": 1.0383495092391968, "learning_rate": 9.779375352854368e-06, "loss": 0.7776, "step": 2520 }, { "epoch": 0.29663500996599834, "grad_norm": 1.0248640775680542, "learning_rate": 9.777561159151358e-06, "loss": 0.7959, "step": 2530 }, { "epoch": 0.29780748036112087, "grad_norm": 1.460216760635376, "learning_rate": 9.775739706586887e-06, "loss": 0.8203, "step": 2540 }, { "epoch": 0.2989799507562434, "grad_norm": 1.0873559713363647, "learning_rate": 9.773910997928405e-06, "loss": 0.7608, "step": 2550 }, { "epoch": 0.3001524211513659, "grad_norm": 0.9547258615493774, "learning_rate": 9.772075035954391e-06, "loss": 0.7907, "step": 2560 }, { "epoch": 0.30132489154648845, "grad_norm": 1.0625553131103516, "learning_rate": 9.77023182345434e-06, "loss": 0.822, "step": 2570 }, { "epoch": 0.302497361941611, "grad_norm": 0.9914059638977051, "learning_rate": 9.768381363228767e-06, "loss": 0.7872, "step": 2580 }, { "epoch": 0.3036698323367335, "grad_norm": 1.0038251876831055, "learning_rate": 9.766523658089197e-06, "loss": 0.8141, "step": 2590 }, { "epoch": 0.30484230273185603, "grad_norm": 1.004831075668335, "learning_rate": 9.764658710858164e-06, "loss": 0.8141, "step": 2600 }, { "epoch": 0.30601477312697856, "grad_norm": 1.021079182624817, "learning_rate": 9.762786524369203e-06, "loss": 0.8248, "step": 2610 }, { "epoch": 0.3071872435221011, "grad_norm": 0.9352837204933167, "learning_rate": 9.760907101466848e-06, "loss": 0.7884, "step": 2620 }, { "epoch": 0.3083597139172236, "grad_norm": 1.111978530883789, "learning_rate": 9.759020445006634e-06, "loss": 0.794, "step": 2630 }, { "epoch": 0.30953218431234614, "grad_norm": 1.2116807699203491, "learning_rate": 9.757126557855077e-06, "loss": 0.7917, "step": 2640 }, { "epoch": 0.31070465470746866, "grad_norm": 1.0014216899871826, "learning_rate": 9.755225442889687e-06, "loss": 0.7941, "step": 2650 }, { "epoch": 0.31187712510259114, "grad_norm": 1.1326111555099487, "learning_rate": 9.753317102998949e-06, "loss": 0.7906, "step": 2660 }, { "epoch": 0.31304959549771366, "grad_norm": 1.1267919540405273, "learning_rate": 9.751401541082333e-06, "loss": 0.7683, "step": 2670 }, { "epoch": 0.3142220658928362, "grad_norm": 1.0329381227493286, "learning_rate": 9.749478760050275e-06, "loss": 0.8287, "step": 2680 }, { "epoch": 0.3153945362879587, "grad_norm": 1.0036214590072632, "learning_rate": 9.747548762824182e-06, "loss": 0.8236, "step": 2690 }, { "epoch": 0.31656700668308124, "grad_norm": 0.9548210501670837, "learning_rate": 9.745611552336426e-06, "loss": 0.7239, "step": 2700 }, { "epoch": 0.31773947707820377, "grad_norm": 1.071669101715088, "learning_rate": 9.743667131530337e-06, "loss": 0.7551, "step": 2710 }, { "epoch": 0.3189119474733263, "grad_norm": 1.0986937284469604, "learning_rate": 9.741715503360201e-06, "loss": 0.7935, "step": 2720 }, { "epoch": 0.3200844178684488, "grad_norm": 0.982127845287323, "learning_rate": 9.739756670791257e-06, "loss": 0.8311, "step": 2730 }, { "epoch": 0.32125688826357135, "grad_norm": 1.0486985445022583, "learning_rate": 9.737790636799683e-06, "loss": 0.7821, "step": 2740 }, { "epoch": 0.3224293586586939, "grad_norm": 1.071585774421692, "learning_rate": 9.735817404372605e-06, "loss": 0.8158, "step": 2750 }, { "epoch": 0.3236018290538164, "grad_norm": 1.2274806499481201, "learning_rate": 9.733836976508086e-06, "loss": 0.7818, "step": 2760 }, { "epoch": 0.32477429944893893, "grad_norm": 1.3032951354980469, "learning_rate": 9.731849356215118e-06, "loss": 0.7814, "step": 2770 }, { "epoch": 0.32594676984406146, "grad_norm": 1.1242070198059082, "learning_rate": 9.729854546513622e-06, "loss": 0.7713, "step": 2780 }, { "epoch": 0.327119240239184, "grad_norm": 0.9936765432357788, "learning_rate": 9.727852550434445e-06, "loss": 0.7793, "step": 2790 }, { "epoch": 0.3282917106343065, "grad_norm": 1.1073092222213745, "learning_rate": 9.72584337101935e-06, "loss": 0.7735, "step": 2800 }, { "epoch": 0.329464181029429, "grad_norm": 0.945728600025177, "learning_rate": 9.723827011321014e-06, "loss": 0.7903, "step": 2810 }, { "epoch": 0.3306366514245515, "grad_norm": 1.0138789415359497, "learning_rate": 9.721803474403024e-06, "loss": 0.8026, "step": 2820 }, { "epoch": 0.33180912181967404, "grad_norm": 1.1122463941574097, "learning_rate": 9.719772763339875e-06, "loss": 0.7608, "step": 2830 }, { "epoch": 0.33298159221479656, "grad_norm": 0.9616718888282776, "learning_rate": 9.717734881216956e-06, "loss": 0.7362, "step": 2840 }, { "epoch": 0.3341540626099191, "grad_norm": 0.9542321562767029, "learning_rate": 9.715689831130557e-06, "loss": 0.7441, "step": 2850 }, { "epoch": 0.3353265330050416, "grad_norm": 1.18282151222229, "learning_rate": 9.713637616187854e-06, "loss": 0.8049, "step": 2860 }, { "epoch": 0.33649900340016414, "grad_norm": 0.985284686088562, "learning_rate": 9.711578239506914e-06, "loss": 0.7232, "step": 2870 }, { "epoch": 0.33767147379528667, "grad_norm": 1.0367258787155151, "learning_rate": 9.709511704216681e-06, "loss": 0.7196, "step": 2880 }, { "epoch": 0.3388439441904092, "grad_norm": 1.1190388202667236, "learning_rate": 9.707438013456979e-06, "loss": 0.7226, "step": 2890 }, { "epoch": 0.3400164145855317, "grad_norm": 1.2934350967407227, "learning_rate": 9.7053571703785e-06, "loss": 0.8353, "step": 2900 }, { "epoch": 0.34118888498065425, "grad_norm": 0.9021798372268677, "learning_rate": 9.703269178142807e-06, "loss": 0.7805, "step": 2910 }, { "epoch": 0.3423613553757768, "grad_norm": 1.6425260305404663, "learning_rate": 9.701174039922323e-06, "loss": 0.7522, "step": 2920 }, { "epoch": 0.3435338257708993, "grad_norm": 1.082169532775879, "learning_rate": 9.699071758900329e-06, "loss": 0.7929, "step": 2930 }, { "epoch": 0.34470629616602183, "grad_norm": 1.0972063541412354, "learning_rate": 9.696962338270957e-06, "loss": 0.8722, "step": 2940 }, { "epoch": 0.34587876656114436, "grad_norm": 1.036820411682129, "learning_rate": 9.694845781239188e-06, "loss": 0.746, "step": 2950 }, { "epoch": 0.34705123695626683, "grad_norm": 1.1242300271987915, "learning_rate": 9.692722091020846e-06, "loss": 0.7661, "step": 2960 }, { "epoch": 0.34822370735138936, "grad_norm": 1.1167043447494507, "learning_rate": 9.690591270842594e-06, "loss": 0.7393, "step": 2970 }, { "epoch": 0.3493961777465119, "grad_norm": 0.9426207542419434, "learning_rate": 9.688453323941925e-06, "loss": 0.7894, "step": 2980 }, { "epoch": 0.3505686481416344, "grad_norm": 1.037908911705017, "learning_rate": 9.68630825356716e-06, "loss": 0.7354, "step": 2990 }, { "epoch": 0.35174111853675694, "grad_norm": 0.9614027738571167, "learning_rate": 9.68415606297745e-06, "loss": 0.7538, "step": 3000 }, { "epoch": 0.35174111853675694, "eval_loss": 0.7815088629722595, "eval_model_preparation_time": 0.0, "eval_runtime": 2141.9353, "eval_samples_per_second": 3.54, "eval_steps_per_second": 1.77, "step": 3000 }, { "epoch": 0.35291358893187946, "grad_norm": 1.0366264581680298, "learning_rate": 9.681996755442753e-06, "loss": 0.7342, "step": 3010 }, { "epoch": 0.354086059327002, "grad_norm": 1.0886765718460083, "learning_rate": 9.679830334243852e-06, "loss": 0.8028, "step": 3020 }, { "epoch": 0.3552585297221245, "grad_norm": 0.9815817475318909, "learning_rate": 9.677656802672328e-06, "loss": 0.7434, "step": 3030 }, { "epoch": 0.35643100011724704, "grad_norm": 1.108848214149475, "learning_rate": 9.675476164030573e-06, "loss": 0.7578, "step": 3040 }, { "epoch": 0.35760347051236957, "grad_norm": 0.9668612480163574, "learning_rate": 9.673288421631771e-06, "loss": 0.7868, "step": 3050 }, { "epoch": 0.3587759409074921, "grad_norm": 1.0971283912658691, "learning_rate": 9.671093578799906e-06, "loss": 0.818, "step": 3060 }, { "epoch": 0.3599484113026146, "grad_norm": 1.0179086923599243, "learning_rate": 9.668891638869742e-06, "loss": 0.7695, "step": 3070 }, { "epoch": 0.36112088169773715, "grad_norm": 1.0052766799926758, "learning_rate": 9.666682605186834e-06, "loss": 0.793, "step": 3080 }, { "epoch": 0.3622933520928597, "grad_norm": 1.117011547088623, "learning_rate": 9.66446648110751e-06, "loss": 0.8371, "step": 3090 }, { "epoch": 0.3634658224879822, "grad_norm": 1.09272301197052, "learning_rate": 9.662243269998875e-06, "loss": 0.8061, "step": 3100 }, { "epoch": 0.3646382928831047, "grad_norm": 1.1492865085601807, "learning_rate": 9.660012975238796e-06, "loss": 0.7634, "step": 3110 }, { "epoch": 0.3658107632782272, "grad_norm": 1.0213364362716675, "learning_rate": 9.657775600215906e-06, "loss": 0.7386, "step": 3120 }, { "epoch": 0.36698323367334973, "grad_norm": 1.0134015083312988, "learning_rate": 9.655531148329595e-06, "loss": 0.7816, "step": 3130 }, { "epoch": 0.36815570406847226, "grad_norm": 1.3230915069580078, "learning_rate": 9.653279622990009e-06, "loss": 0.8131, "step": 3140 }, { "epoch": 0.3693281744635948, "grad_norm": 1.0691282749176025, "learning_rate": 9.651021027618035e-06, "loss": 0.8045, "step": 3150 }, { "epoch": 0.3705006448587173, "grad_norm": 1.1193733215332031, "learning_rate": 9.648755365645306e-06, "loss": 0.7588, "step": 3160 }, { "epoch": 0.37167311525383984, "grad_norm": 1.1530718803405762, "learning_rate": 9.64648264051419e-06, "loss": 0.8282, "step": 3170 }, { "epoch": 0.37284558564896236, "grad_norm": 1.0649746656417847, "learning_rate": 9.644202855677786e-06, "loss": 0.7748, "step": 3180 }, { "epoch": 0.3740180560440849, "grad_norm": 0.9248124957084656, "learning_rate": 9.641916014599923e-06, "loss": 0.7743, "step": 3190 }, { "epoch": 0.3751905264392074, "grad_norm": 1.0307985544204712, "learning_rate": 9.639622120755148e-06, "loss": 0.7179, "step": 3200 }, { "epoch": 0.37636299683432994, "grad_norm": 1.140249252319336, "learning_rate": 9.63732117762872e-06, "loss": 0.7957, "step": 3210 }, { "epoch": 0.37753546722945247, "grad_norm": 1.1943104267120361, "learning_rate": 9.635013188716617e-06, "loss": 0.7863, "step": 3220 }, { "epoch": 0.378707937624575, "grad_norm": 0.9520907402038574, "learning_rate": 9.632698157525517e-06, "loss": 0.7696, "step": 3230 }, { "epoch": 0.3798804080196975, "grad_norm": 1.045203685760498, "learning_rate": 9.630376087572798e-06, "loss": 0.7738, "step": 3240 }, { "epoch": 0.38105287841482005, "grad_norm": 1.0103857517242432, "learning_rate": 9.628046982386531e-06, "loss": 0.7494, "step": 3250 }, { "epoch": 0.3822253488099425, "grad_norm": 1.1064127683639526, "learning_rate": 9.625710845505482e-06, "loss": 0.7801, "step": 3260 }, { "epoch": 0.38339781920506505, "grad_norm": 1.2207891941070557, "learning_rate": 9.623367680479092e-06, "loss": 0.7846, "step": 3270 }, { "epoch": 0.3845702896001876, "grad_norm": 0.9667770266532898, "learning_rate": 9.621017490867488e-06, "loss": 0.8037, "step": 3280 }, { "epoch": 0.3857427599953101, "grad_norm": 0.9694348573684692, "learning_rate": 9.618660280241468e-06, "loss": 0.7447, "step": 3290 }, { "epoch": 0.38691523039043263, "grad_norm": 0.9919257164001465, "learning_rate": 9.616296052182493e-06, "loss": 0.8115, "step": 3300 }, { "epoch": 0.38808770078555516, "grad_norm": 1.0565401315689087, "learning_rate": 9.613924810282692e-06, "loss": 0.774, "step": 3310 }, { "epoch": 0.3892601711806777, "grad_norm": 1.0694124698638916, "learning_rate": 9.611546558144846e-06, "loss": 0.7711, "step": 3320 }, { "epoch": 0.3904326415758002, "grad_norm": 0.9765451550483704, "learning_rate": 9.609161299382392e-06, "loss": 0.7072, "step": 3330 }, { "epoch": 0.39160511197092274, "grad_norm": 1.255078673362732, "learning_rate": 9.606769037619408e-06, "loss": 0.8163, "step": 3340 }, { "epoch": 0.39277758236604526, "grad_norm": 0.9939749240875244, "learning_rate": 9.604369776490615e-06, "loss": 0.7388, "step": 3350 }, { "epoch": 0.3939500527611678, "grad_norm": 0.9769892692565918, "learning_rate": 9.601963519641365e-06, "loss": 0.7559, "step": 3360 }, { "epoch": 0.3951225231562903, "grad_norm": 1.035705804824829, "learning_rate": 9.599550270727643e-06, "loss": 0.7715, "step": 3370 }, { "epoch": 0.39629499355141284, "grad_norm": 1.5238893032073975, "learning_rate": 9.597130033416058e-06, "loss": 0.7975, "step": 3380 }, { "epoch": 0.39746746394653537, "grad_norm": 1.0177576541900635, "learning_rate": 9.594702811383833e-06, "loss": 0.7578, "step": 3390 }, { "epoch": 0.3986399343416579, "grad_norm": 0.9629125595092773, "learning_rate": 9.592268608318804e-06, "loss": 0.7308, "step": 3400 }, { "epoch": 0.39981240473678037, "grad_norm": 1.1071351766586304, "learning_rate": 9.589827427919418e-06, "loss": 0.7432, "step": 3410 }, { "epoch": 0.4009848751319029, "grad_norm": 1.0955792665481567, "learning_rate": 9.587379273894722e-06, "loss": 0.7541, "step": 3420 }, { "epoch": 0.4021573455270254, "grad_norm": 1.0997943878173828, "learning_rate": 9.58492414996435e-06, "loss": 0.8535, "step": 3430 }, { "epoch": 0.40332981592214795, "grad_norm": 1.1344927549362183, "learning_rate": 9.582462059858537e-06, "loss": 0.7453, "step": 3440 }, { "epoch": 0.4045022863172705, "grad_norm": 1.1749157905578613, "learning_rate": 9.579993007318098e-06, "loss": 0.8208, "step": 3450 }, { "epoch": 0.405674756712393, "grad_norm": 1.0422290563583374, "learning_rate": 9.577516996094423e-06, "loss": 0.7635, "step": 3460 }, { "epoch": 0.40684722710751553, "grad_norm": 1.1056243181228638, "learning_rate": 9.575034029949481e-06, "loss": 0.7327, "step": 3470 }, { "epoch": 0.40801969750263806, "grad_norm": 1.039713978767395, "learning_rate": 9.572544112655806e-06, "loss": 0.7173, "step": 3480 }, { "epoch": 0.4091921678977606, "grad_norm": 1.0058737993240356, "learning_rate": 9.570047247996488e-06, "loss": 0.8225, "step": 3490 }, { "epoch": 0.4103646382928831, "grad_norm": 1.1531774997711182, "learning_rate": 9.567543439765182e-06, "loss": 0.7856, "step": 3500 }, { "epoch": 0.4103646382928831, "eval_loss": 0.7753819823265076, "eval_model_preparation_time": 0.0, "eval_runtime": 2148.5991, "eval_samples_per_second": 3.529, "eval_steps_per_second": 1.764, "step": 3500 }, { "epoch": 0.41153710868800564, "grad_norm": 1.0665730237960815, "learning_rate": 9.565032691766084e-06, "loss": 0.807, "step": 3510 }, { "epoch": 0.41270957908312816, "grad_norm": 1.015415072441101, "learning_rate": 9.562515007813938e-06, "loss": 0.7501, "step": 3520 }, { "epoch": 0.4138820494782507, "grad_norm": 0.9460575580596924, "learning_rate": 9.559990391734026e-06, "loss": 0.783, "step": 3530 }, { "epoch": 0.4150545198733732, "grad_norm": 1.1467220783233643, "learning_rate": 9.557458847362165e-06, "loss": 0.7567, "step": 3540 }, { "epoch": 0.41622699026849574, "grad_norm": 1.1130205392837524, "learning_rate": 9.554920378544694e-06, "loss": 0.7809, "step": 3550 }, { "epoch": 0.41739946066361827, "grad_norm": 0.9414743781089783, "learning_rate": 9.552374989138478e-06, "loss": 0.7629, "step": 3560 }, { "epoch": 0.41857193105874074, "grad_norm": 1.0594080686569214, "learning_rate": 9.54982268301089e-06, "loss": 0.7972, "step": 3570 }, { "epoch": 0.41974440145386327, "grad_norm": 1.0567100048065186, "learning_rate": 9.547263464039817e-06, "loss": 0.7144, "step": 3580 }, { "epoch": 0.4209168718489858, "grad_norm": 1.1648190021514893, "learning_rate": 9.544697336113646e-06, "loss": 0.7807, "step": 3590 }, { "epoch": 0.4220893422441083, "grad_norm": 1.5024479627609253, "learning_rate": 9.542124303131267e-06, "loss": 0.7456, "step": 3600 }, { "epoch": 0.42326181263923085, "grad_norm": 1.0676318407058716, "learning_rate": 9.539544369002058e-06, "loss": 0.7601, "step": 3610 }, { "epoch": 0.4244342830343534, "grad_norm": 0.9238914847373962, "learning_rate": 9.536957537645877e-06, "loss": 0.7226, "step": 3620 }, { "epoch": 0.4256067534294759, "grad_norm": 1.0641636848449707, "learning_rate": 9.534363812993068e-06, "loss": 0.7389, "step": 3630 }, { "epoch": 0.42677922382459843, "grad_norm": 1.1077290773391724, "learning_rate": 9.53176319898445e-06, "loss": 0.8195, "step": 3640 }, { "epoch": 0.42795169421972096, "grad_norm": 0.9798630475997925, "learning_rate": 9.529155699571301e-06, "loss": 0.8015, "step": 3650 }, { "epoch": 0.4291241646148435, "grad_norm": 1.0194880962371826, "learning_rate": 9.52654131871537e-06, "loss": 0.7433, "step": 3660 }, { "epoch": 0.430296635009966, "grad_norm": 1.1731258630752563, "learning_rate": 9.523920060388853e-06, "loss": 0.7766, "step": 3670 }, { "epoch": 0.43146910540508854, "grad_norm": 1.0388282537460327, "learning_rate": 9.521291928574404e-06, "loss": 0.8012, "step": 3680 }, { "epoch": 0.43264157580021106, "grad_norm": 1.0118495225906372, "learning_rate": 9.518656927265111e-06, "loss": 0.7634, "step": 3690 }, { "epoch": 0.4338140461953336, "grad_norm": 0.9954714179039001, "learning_rate": 9.516015060464508e-06, "loss": 0.7248, "step": 3700 }, { "epoch": 0.4349865165904561, "grad_norm": 1.018477439880371, "learning_rate": 9.513366332186554e-06, "loss": 0.764, "step": 3710 }, { "epoch": 0.4361589869855786, "grad_norm": 0.9497597217559814, "learning_rate": 9.510710746455636e-06, "loss": 0.7384, "step": 3720 }, { "epoch": 0.4373314573807011, "grad_norm": 1.0151541233062744, "learning_rate": 9.50804830730656e-06, "loss": 0.8276, "step": 3730 }, { "epoch": 0.43850392777582364, "grad_norm": 0.8733408451080322, "learning_rate": 9.505379018784543e-06, "loss": 0.7407, "step": 3740 }, { "epoch": 0.43967639817094617, "grad_norm": 0.9291737079620361, "learning_rate": 9.502702884945207e-06, "loss": 0.6816, "step": 3750 }, { "epoch": 0.4408488685660687, "grad_norm": 1.053335428237915, "learning_rate": 9.500019909854581e-06, "loss": 0.815, "step": 3760 }, { "epoch": 0.4420213389611912, "grad_norm": 1.0885114669799805, "learning_rate": 9.497330097589085e-06, "loss": 0.8501, "step": 3770 }, { "epoch": 0.44319380935631375, "grad_norm": 1.0059987306594849, "learning_rate": 9.494633452235528e-06, "loss": 0.8162, "step": 3780 }, { "epoch": 0.4443662797514363, "grad_norm": 1.1023715734481812, "learning_rate": 9.491929977891095e-06, "loss": 0.7731, "step": 3790 }, { "epoch": 0.4455387501465588, "grad_norm": 1.2773202657699585, "learning_rate": 9.489219678663356e-06, "loss": 0.7754, "step": 3800 }, { "epoch": 0.44671122054168133, "grad_norm": 1.105835199356079, "learning_rate": 9.486502558670244e-06, "loss": 0.8099, "step": 3810 }, { "epoch": 0.44788369093680386, "grad_norm": 1.1122444868087769, "learning_rate": 9.483778622040057e-06, "loss": 0.742, "step": 3820 }, { "epoch": 0.4490561613319264, "grad_norm": 1.00811767578125, "learning_rate": 9.48104787291145e-06, "loss": 0.7334, "step": 3830 }, { "epoch": 0.4502286317270489, "grad_norm": 1.0694729089736938, "learning_rate": 9.478310315433432e-06, "loss": 0.7857, "step": 3840 }, { "epoch": 0.45140110212217144, "grad_norm": 0.9792755246162415, "learning_rate": 9.475565953765352e-06, "loss": 0.7705, "step": 3850 }, { "epoch": 0.45257357251729396, "grad_norm": 1.0651715993881226, "learning_rate": 9.472814792076894e-06, "loss": 0.7862, "step": 3860 }, { "epoch": 0.45374604291241644, "grad_norm": 1.2940688133239746, "learning_rate": 9.470056834548084e-06, "loss": 0.7877, "step": 3870 }, { "epoch": 0.45491851330753896, "grad_norm": 1.4561967849731445, "learning_rate": 9.467292085369264e-06, "loss": 0.8124, "step": 3880 }, { "epoch": 0.4560909837026615, "grad_norm": 1.286630630493164, "learning_rate": 9.464520548741101e-06, "loss": 0.7778, "step": 3890 }, { "epoch": 0.457263454097784, "grad_norm": 1.0348161458969116, "learning_rate": 9.461742228874568e-06, "loss": 0.7978, "step": 3900 }, { "epoch": 0.45843592449290654, "grad_norm": 0.9772406220436096, "learning_rate": 9.458957129990953e-06, "loss": 0.8688, "step": 3910 }, { "epoch": 0.45960839488802907, "grad_norm": 0.946616530418396, "learning_rate": 9.456165256321836e-06, "loss": 0.7477, "step": 3920 }, { "epoch": 0.4607808652831516, "grad_norm": 0.9942333102226257, "learning_rate": 9.453366612109095e-06, "loss": 0.7508, "step": 3930 }, { "epoch": 0.4619533356782741, "grad_norm": 1.2723950147628784, "learning_rate": 9.450561201604892e-06, "loss": 0.7248, "step": 3940 }, { "epoch": 0.46312580607339665, "grad_norm": 1.042472004890442, "learning_rate": 9.447749029071672e-06, "loss": 0.7264, "step": 3950 }, { "epoch": 0.4642982764685192, "grad_norm": 1.0849319696426392, "learning_rate": 9.444930098782152e-06, "loss": 0.7636, "step": 3960 }, { "epoch": 0.4654707468636417, "grad_norm": 1.0128146409988403, "learning_rate": 9.44210441501932e-06, "loss": 0.7123, "step": 3970 }, { "epoch": 0.46664321725876423, "grad_norm": 0.9191023707389832, "learning_rate": 9.439271982076417e-06, "loss": 0.754, "step": 3980 }, { "epoch": 0.46781568765388676, "grad_norm": 1.1305320262908936, "learning_rate": 9.436432804256949e-06, "loss": 0.7184, "step": 3990 }, { "epoch": 0.4689881580490093, "grad_norm": 1.0158164501190186, "learning_rate": 9.433586885874662e-06, "loss": 0.8379, "step": 4000 }, { "epoch": 0.4689881580490093, "eval_loss": 0.7699258923530579, "eval_model_preparation_time": 0.0, "eval_runtime": 2148.008, "eval_samples_per_second": 3.53, "eval_steps_per_second": 1.765, "step": 4000 }, { "epoch": 0.4701606284441318, "grad_norm": 0.9390503168106079, "learning_rate": 9.430734231253546e-06, "loss": 0.69, "step": 4010 }, { "epoch": 0.4713330988392543, "grad_norm": 1.080346703529358, "learning_rate": 9.427874844727827e-06, "loss": 0.7833, "step": 4020 }, { "epoch": 0.4725055692343768, "grad_norm": 1.1072280406951904, "learning_rate": 9.425008730641961e-06, "loss": 0.819, "step": 4030 }, { "epoch": 0.47367803962949934, "grad_norm": 1.0966583490371704, "learning_rate": 9.422135893350615e-06, "loss": 0.7861, "step": 4040 }, { "epoch": 0.47485051002462186, "grad_norm": 0.89730304479599, "learning_rate": 9.419256337218686e-06, "loss": 0.7961, "step": 4050 }, { "epoch": 0.4760229804197444, "grad_norm": 1.006612777709961, "learning_rate": 9.41637006662127e-06, "loss": 0.7875, "step": 4060 }, { "epoch": 0.4771954508148669, "grad_norm": 1.051352620124817, "learning_rate": 9.413477085943665e-06, "loss": 0.7213, "step": 4070 }, { "epoch": 0.47836792120998944, "grad_norm": 0.9035906791687012, "learning_rate": 9.410577399581369e-06, "loss": 0.8104, "step": 4080 }, { "epoch": 0.47954039160511197, "grad_norm": 1.075790524482727, "learning_rate": 9.407671011940063e-06, "loss": 0.7115, "step": 4090 }, { "epoch": 0.4807128620002345, "grad_norm": 1.4569838047027588, "learning_rate": 9.404757927435613e-06, "loss": 0.8222, "step": 4100 }, { "epoch": 0.481885332395357, "grad_norm": 0.9240731000900269, "learning_rate": 9.401838150494057e-06, "loss": 0.7452, "step": 4110 }, { "epoch": 0.48305780279047955, "grad_norm": 1.2677507400512695, "learning_rate": 9.398911685551607e-06, "loss": 0.8047, "step": 4120 }, { "epoch": 0.4842302731856021, "grad_norm": 1.1028954982757568, "learning_rate": 9.395978537054631e-06, "loss": 0.7758, "step": 4130 }, { "epoch": 0.4854027435807246, "grad_norm": 0.9975004196166992, "learning_rate": 9.393038709459654e-06, "loss": 0.8269, "step": 4140 }, { "epoch": 0.48657521397584713, "grad_norm": 1.1160857677459717, "learning_rate": 9.390092207233346e-06, "loss": 0.7547, "step": 4150 }, { "epoch": 0.48774768437096966, "grad_norm": 1.0660200119018555, "learning_rate": 9.387139034852524e-06, "loss": 0.7613, "step": 4160 }, { "epoch": 0.48892015476609213, "grad_norm": 1.143187403678894, "learning_rate": 9.384179196804134e-06, "loss": 0.7393, "step": 4170 }, { "epoch": 0.49009262516121466, "grad_norm": 1.1493088006973267, "learning_rate": 9.381212697585253e-06, "loss": 0.7624, "step": 4180 }, { "epoch": 0.4912650955563372, "grad_norm": 1.0811346769332886, "learning_rate": 9.378239541703078e-06, "loss": 0.7455, "step": 4190 }, { "epoch": 0.4924375659514597, "grad_norm": 0.9973095655441284, "learning_rate": 9.375259733674917e-06, "loss": 0.7383, "step": 4200 }, { "epoch": 0.49361003634658224, "grad_norm": 1.1703022718429565, "learning_rate": 9.37227327802819e-06, "loss": 0.8025, "step": 4210 }, { "epoch": 0.49478250674170476, "grad_norm": 1.0889148712158203, "learning_rate": 9.369280179300413e-06, "loss": 0.7427, "step": 4220 }, { "epoch": 0.4959549771368273, "grad_norm": 1.0385446548461914, "learning_rate": 9.366280442039198e-06, "loss": 0.7678, "step": 4230 }, { "epoch": 0.4971274475319498, "grad_norm": 0.9212861061096191, "learning_rate": 9.363274070802242e-06, "loss": 0.7337, "step": 4240 }, { "epoch": 0.49829991792707234, "grad_norm": 1.0867843627929688, "learning_rate": 9.360261070157319e-06, "loss": 0.7601, "step": 4250 }, { "epoch": 0.49947238832219487, "grad_norm": 1.1652419567108154, "learning_rate": 9.357241444682282e-06, "loss": 0.7376, "step": 4260 }, { "epoch": 0.5006448587173173, "grad_norm": 0.8776029348373413, "learning_rate": 9.354215198965045e-06, "loss": 0.7678, "step": 4270 }, { "epoch": 0.5018173291124399, "grad_norm": 1.0649455785751343, "learning_rate": 9.351182337603579e-06, "loss": 0.8023, "step": 4280 }, { "epoch": 0.5029897995075624, "grad_norm": 1.0543705224990845, "learning_rate": 9.34814286520591e-06, "loss": 0.7638, "step": 4290 }, { "epoch": 0.504162269902685, "grad_norm": 1.1163556575775146, "learning_rate": 9.345096786390107e-06, "loss": 0.7694, "step": 4300 }, { "epoch": 0.5053347402978074, "grad_norm": 1.2409310340881348, "learning_rate": 9.342044105784278e-06, "loss": 0.7862, "step": 4310 }, { "epoch": 0.50650721069293, "grad_norm": 1.1419397592544556, "learning_rate": 9.33898482802656e-06, "loss": 0.7825, "step": 4320 }, { "epoch": 0.5076796810880525, "grad_norm": 1.0327321290969849, "learning_rate": 9.335918957765115e-06, "loss": 0.7624, "step": 4330 }, { "epoch": 0.5088521514831751, "grad_norm": 0.9816399216651917, "learning_rate": 9.33284649965812e-06, "loss": 0.7791, "step": 4340 }, { "epoch": 0.5100246218782976, "grad_norm": 1.1001486778259277, "learning_rate": 9.329767458373758e-06, "loss": 0.7689, "step": 4350 }, { "epoch": 0.5111970922734201, "grad_norm": 0.9570380449295044, "learning_rate": 9.326681838590224e-06, "loss": 0.7147, "step": 4360 }, { "epoch": 0.5123695626685426, "grad_norm": 0.9480714201927185, "learning_rate": 9.323589644995697e-06, "loss": 0.7985, "step": 4370 }, { "epoch": 0.5135420330636652, "grad_norm": 0.9975506067276001, "learning_rate": 9.320490882288353e-06, "loss": 0.7493, "step": 4380 }, { "epoch": 0.5147145034587877, "grad_norm": 0.9568571448326111, "learning_rate": 9.317385555176343e-06, "loss": 0.7643, "step": 4390 }, { "epoch": 0.5158869738539101, "grad_norm": 1.0263663530349731, "learning_rate": 9.314273668377796e-06, "loss": 0.7295, "step": 4400 }, { "epoch": 0.5170594442490327, "grad_norm": 0.950812816619873, "learning_rate": 9.311155226620802e-06, "loss": 0.7676, "step": 4410 }, { "epoch": 0.5182319146441552, "grad_norm": 0.9604426026344299, "learning_rate": 9.308030234643417e-06, "loss": 0.8295, "step": 4420 }, { "epoch": 0.5194043850392778, "grad_norm": 1.022865891456604, "learning_rate": 9.304898697193643e-06, "loss": 0.7751, "step": 4430 }, { "epoch": 0.5205768554344002, "grad_norm": 1.1289923191070557, "learning_rate": 9.301760619029432e-06, "loss": 0.7556, "step": 4440 }, { "epoch": 0.5217493258295228, "grad_norm": 1.2246206998825073, "learning_rate": 9.298616004918671e-06, "loss": 0.7662, "step": 4450 }, { "epoch": 0.5229217962246453, "grad_norm": 0.9444355368614197, "learning_rate": 9.295464859639179e-06, "loss": 0.7628, "step": 4460 }, { "epoch": 0.5240942666197679, "grad_norm": 1.0577831268310547, "learning_rate": 9.292307187978693e-06, "loss": 0.7728, "step": 4470 }, { "epoch": 0.5252667370148904, "grad_norm": 1.6070692539215088, "learning_rate": 9.289142994734877e-06, "loss": 0.6846, "step": 4480 }, { "epoch": 0.5264392074100129, "grad_norm": 1.11727774143219, "learning_rate": 9.285972284715291e-06, "loss": 0.7317, "step": 4490 }, { "epoch": 0.5276116778051354, "grad_norm": 0.9389254450798035, "learning_rate": 9.282795062737406e-06, "loss": 0.7317, "step": 4500 }, { "epoch": 0.5276116778051354, "eval_loss": 0.7636198401451111, "eval_model_preparation_time": 0.0, "eval_runtime": 2150.1891, "eval_samples_per_second": 3.526, "eval_steps_per_second": 1.763, "step": 4500 }, { "epoch": 0.528784148200258, "grad_norm": 1.0441845655441284, "learning_rate": 9.279611333628582e-06, "loss": 0.7424, "step": 4510 }, { "epoch": 0.5299566185953805, "grad_norm": 1.0149439573287964, "learning_rate": 9.276421102226068e-06, "loss": 0.7616, "step": 4520 }, { "epoch": 0.531129088990503, "grad_norm": 1.1834802627563477, "learning_rate": 9.27322437337699e-06, "loss": 0.766, "step": 4530 }, { "epoch": 0.5323015593856255, "grad_norm": 0.9238637089729309, "learning_rate": 9.27002115193835e-06, "loss": 0.6836, "step": 4540 }, { "epoch": 0.533474029780748, "grad_norm": 1.044451117515564, "learning_rate": 9.266811442777006e-06, "loss": 0.7772, "step": 4550 }, { "epoch": 0.5346465001758706, "grad_norm": 1.1899291276931763, "learning_rate": 9.263595250769686e-06, "loss": 0.7311, "step": 4560 }, { "epoch": 0.535818970570993, "grad_norm": 1.1379649639129639, "learning_rate": 9.26037258080296e-06, "loss": 0.8378, "step": 4570 }, { "epoch": 0.5369914409661156, "grad_norm": 1.0653419494628906, "learning_rate": 9.257143437773238e-06, "loss": 0.7079, "step": 4580 }, { "epoch": 0.5381639113612381, "grad_norm": 1.0314478874206543, "learning_rate": 9.253907826586772e-06, "loss": 0.794, "step": 4590 }, { "epoch": 0.5393363817563607, "grad_norm": 1.1861941814422607, "learning_rate": 9.250665752159635e-06, "loss": 0.7291, "step": 4600 }, { "epoch": 0.5405088521514831, "grad_norm": 1.4750096797943115, "learning_rate": 9.247417219417724e-06, "loss": 0.8174, "step": 4610 }, { "epoch": 0.5416813225466057, "grad_norm": 1.0834195613861084, "learning_rate": 9.24416223329675e-06, "loss": 0.8059, "step": 4620 }, { "epoch": 0.5428537929417282, "grad_norm": 1.0474600791931152, "learning_rate": 9.240900798742223e-06, "loss": 0.7543, "step": 4630 }, { "epoch": 0.5440262633368508, "grad_norm": 1.1119709014892578, "learning_rate": 9.237632920709456e-06, "loss": 0.7627, "step": 4640 }, { "epoch": 0.5451987337319733, "grad_norm": 1.0830745697021484, "learning_rate": 9.234358604163549e-06, "loss": 0.819, "step": 4650 }, { "epoch": 0.5463712041270958, "grad_norm": 1.202317476272583, "learning_rate": 9.231077854079385e-06, "loss": 0.8103, "step": 4660 }, { "epoch": 0.5475436745222183, "grad_norm": 1.5650395154953003, "learning_rate": 9.227790675441625e-06, "loss": 0.7493, "step": 4670 }, { "epoch": 0.5487161449173409, "grad_norm": 1.0369755029678345, "learning_rate": 9.224497073244692e-06, "loss": 0.7294, "step": 4680 }, { "epoch": 0.5498886153124634, "grad_norm": 1.040549635887146, "learning_rate": 9.221197052492772e-06, "loss": 0.7439, "step": 4690 }, { "epoch": 0.5510610857075858, "grad_norm": 1.0199238061904907, "learning_rate": 9.217890618199805e-06, "loss": 0.6343, "step": 4700 }, { "epoch": 0.5522335561027084, "grad_norm": 0.9841470122337341, "learning_rate": 9.214577775389471e-06, "loss": 0.7444, "step": 4710 }, { "epoch": 0.5534060264978309, "grad_norm": 1.1280114650726318, "learning_rate": 9.211258529095188e-06, "loss": 0.77, "step": 4720 }, { "epoch": 0.5545784968929535, "grad_norm": 1.0666847229003906, "learning_rate": 9.207932884360105e-06, "loss": 0.7759, "step": 4730 }, { "epoch": 0.5557509672880759, "grad_norm": 1.0521173477172852, "learning_rate": 9.204600846237094e-06, "loss": 0.75, "step": 4740 }, { "epoch": 0.5569234376831985, "grad_norm": 1.1258000135421753, "learning_rate": 9.201262419788735e-06, "loss": 0.7958, "step": 4750 }, { "epoch": 0.558095908078321, "grad_norm": 1.018381118774414, "learning_rate": 9.19791761008732e-06, "loss": 0.7431, "step": 4760 }, { "epoch": 0.5592683784734436, "grad_norm": 1.1162135601043701, "learning_rate": 9.194566422214838e-06, "loss": 0.7967, "step": 4770 }, { "epoch": 0.560440848868566, "grad_norm": 1.4272518157958984, "learning_rate": 9.191208861262967e-06, "loss": 0.7875, "step": 4780 }, { "epoch": 0.5616133192636886, "grad_norm": 1.0465847253799438, "learning_rate": 9.187844932333069e-06, "loss": 0.7328, "step": 4790 }, { "epoch": 0.5627857896588111, "grad_norm": 1.1061692237854004, "learning_rate": 9.184474640536181e-06, "loss": 0.7208, "step": 4800 }, { "epoch": 0.5639582600539337, "grad_norm": 0.9398561120033264, "learning_rate": 9.181097990993011e-06, "loss": 0.7126, "step": 4810 }, { "epoch": 0.5651307304490562, "grad_norm": 1.008276343345642, "learning_rate": 9.177714988833918e-06, "loss": 0.7639, "step": 4820 }, { "epoch": 0.5663032008441787, "grad_norm": 0.9404487609863281, "learning_rate": 9.174325639198927e-06, "loss": 0.7997, "step": 4830 }, { "epoch": 0.5674756712393012, "grad_norm": 1.0622434616088867, "learning_rate": 9.17092994723769e-06, "loss": 0.786, "step": 4840 }, { "epoch": 0.5686481416344237, "grad_norm": 1.0083489418029785, "learning_rate": 9.167527918109512e-06, "loss": 0.687, "step": 4850 }, { "epoch": 0.5698206120295463, "grad_norm": 1.0290510654449463, "learning_rate": 9.16411955698331e-06, "loss": 0.7144, "step": 4860 }, { "epoch": 0.5709930824246687, "grad_norm": 1.0405746698379517, "learning_rate": 9.16070486903764e-06, "loss": 0.7737, "step": 4870 }, { "epoch": 0.5721655528197913, "grad_norm": 1.0497112274169922, "learning_rate": 9.157283859460653e-06, "loss": 0.671, "step": 4880 }, { "epoch": 0.5733380232149138, "grad_norm": 1.2931194305419922, "learning_rate": 9.153856533450115e-06, "loss": 0.7407, "step": 4890 }, { "epoch": 0.5745104936100364, "grad_norm": 1.0574007034301758, "learning_rate": 9.15042289621339e-06, "loss": 0.7127, "step": 4900 }, { "epoch": 0.5756829640051588, "grad_norm": 0.9745653867721558, "learning_rate": 9.146982952967424e-06, "loss": 0.7971, "step": 4910 }, { "epoch": 0.5768554344002814, "grad_norm": 0.9327507019042969, "learning_rate": 9.14353670893875e-06, "loss": 0.7825, "step": 4920 }, { "epoch": 0.5780279047954039, "grad_norm": 1.1349756717681885, "learning_rate": 9.14008416936347e-06, "loss": 0.7569, "step": 4930 }, { "epoch": 0.5792003751905265, "grad_norm": 1.1709768772125244, "learning_rate": 9.136625339487256e-06, "loss": 0.8008, "step": 4940 }, { "epoch": 0.5803728455856489, "grad_norm": 1.1274198293685913, "learning_rate": 9.133160224565334e-06, "loss": 0.7209, "step": 4950 }, { "epoch": 0.5815453159807715, "grad_norm": 1.0164213180541992, "learning_rate": 9.12968882986248e-06, "loss": 0.743, "step": 4960 }, { "epoch": 0.582717786375894, "grad_norm": 1.1000438928604126, "learning_rate": 9.12621116065301e-06, "loss": 0.7699, "step": 4970 }, { "epoch": 0.5838902567710166, "grad_norm": 1.052954077720642, "learning_rate": 9.122727222220775e-06, "loss": 0.865, "step": 4980 }, { "epoch": 0.585062727166139, "grad_norm": 0.9745181798934937, "learning_rate": 9.119237019859154e-06, "loss": 0.7435, "step": 4990 }, { "epoch": 0.5862351975612615, "grad_norm": 1.1727217435836792, "learning_rate": 9.115740558871033e-06, "loss": 0.7172, "step": 5000 }, { "epoch": 0.5862351975612615, "eval_loss": 0.7573840022087097, "eval_model_preparation_time": 0.0, "eval_runtime": 2151.6808, "eval_samples_per_second": 3.524, "eval_steps_per_second": 1.762, "step": 5000 }, { "epoch": 0.5874076679563841, "grad_norm": 0.9012227058410645, "learning_rate": 9.11223784456882e-06, "loss": 0.6864, "step": 5010 }, { "epoch": 0.5885801383515066, "grad_norm": 0.9617934226989746, "learning_rate": 9.108728882274418e-06, "loss": 0.7563, "step": 5020 }, { "epoch": 0.5897526087466292, "grad_norm": 1.0030573606491089, "learning_rate": 9.10521367731922e-06, "loss": 0.7771, "step": 5030 }, { "epoch": 0.5909250791417516, "grad_norm": 0.9411152005195618, "learning_rate": 9.101692235044109e-06, "loss": 0.7429, "step": 5040 }, { "epoch": 0.5920975495368742, "grad_norm": 1.00808846950531, "learning_rate": 9.098164560799442e-06, "loss": 0.7914, "step": 5050 }, { "epoch": 0.5932700199319967, "grad_norm": 1.0736342668533325, "learning_rate": 9.094630659945046e-06, "loss": 0.7581, "step": 5060 }, { "epoch": 0.5944424903271193, "grad_norm": 0.936271071434021, "learning_rate": 9.091090537850207e-06, "loss": 0.7029, "step": 5070 }, { "epoch": 0.5956149607222417, "grad_norm": 1.0912885665893555, "learning_rate": 9.087544199893666e-06, "loss": 0.819, "step": 5080 }, { "epoch": 0.5967874311173643, "grad_norm": 0.9491662383079529, "learning_rate": 9.083991651463606e-06, "loss": 0.7915, "step": 5090 }, { "epoch": 0.5979599015124868, "grad_norm": 1.0669374465942383, "learning_rate": 9.080432897957645e-06, "loss": 0.7495, "step": 5100 }, { "epoch": 0.5991323719076094, "grad_norm": 1.0217573642730713, "learning_rate": 9.07686794478283e-06, "loss": 0.7421, "step": 5110 }, { "epoch": 0.6003048423027318, "grad_norm": 1.023919939994812, "learning_rate": 9.073296797355631e-06, "loss": 0.755, "step": 5120 }, { "epoch": 0.6014773126978544, "grad_norm": 1.172607660293579, "learning_rate": 9.069719461101922e-06, "loss": 0.7461, "step": 5130 }, { "epoch": 0.6026497830929769, "grad_norm": 0.8591760993003845, "learning_rate": 9.066135941456989e-06, "loss": 0.7934, "step": 5140 }, { "epoch": 0.6038222534880995, "grad_norm": 1.0679306983947754, "learning_rate": 9.062546243865503e-06, "loss": 0.751, "step": 5150 }, { "epoch": 0.604994723883222, "grad_norm": 0.99530428647995, "learning_rate": 9.058950373781527e-06, "loss": 0.7537, "step": 5160 }, { "epoch": 0.6061671942783444, "grad_norm": 1.0804294347763062, "learning_rate": 9.055348336668508e-06, "loss": 0.7503, "step": 5170 }, { "epoch": 0.607339664673467, "grad_norm": 1.0501015186309814, "learning_rate": 9.05174013799925e-06, "loss": 0.7013, "step": 5180 }, { "epoch": 0.6085121350685895, "grad_norm": 1.0828946828842163, "learning_rate": 9.048125783255926e-06, "loss": 0.7789, "step": 5190 }, { "epoch": 0.6096846054637121, "grad_norm": 1.0722506046295166, "learning_rate": 9.044505277930065e-06, "loss": 0.7556, "step": 5200 }, { "epoch": 0.6108570758588345, "grad_norm": 1.1026360988616943, "learning_rate": 9.040878627522538e-06, "loss": 0.811, "step": 5210 }, { "epoch": 0.6120295462539571, "grad_norm": 1.1557364463806152, "learning_rate": 9.03724583754355e-06, "loss": 0.8377, "step": 5220 }, { "epoch": 0.6132020166490796, "grad_norm": 1.0658044815063477, "learning_rate": 9.033606913512636e-06, "loss": 0.7397, "step": 5230 }, { "epoch": 0.6143744870442022, "grad_norm": 1.004967451095581, "learning_rate": 9.029961860958655e-06, "loss": 0.767, "step": 5240 }, { "epoch": 0.6155469574393246, "grad_norm": 0.9692328572273254, "learning_rate": 9.02631068541977e-06, "loss": 0.7557, "step": 5250 }, { "epoch": 0.6167194278344472, "grad_norm": 1.172110676765442, "learning_rate": 9.022653392443455e-06, "loss": 0.7572, "step": 5260 }, { "epoch": 0.6178918982295697, "grad_norm": 1.1789487600326538, "learning_rate": 9.018989987586473e-06, "loss": 0.7267, "step": 5270 }, { "epoch": 0.6190643686246923, "grad_norm": 1.0001790523529053, "learning_rate": 9.015320476414873e-06, "loss": 0.7625, "step": 5280 }, { "epoch": 0.6202368390198147, "grad_norm": 1.0745205879211426, "learning_rate": 9.011644864503985e-06, "loss": 0.7271, "step": 5290 }, { "epoch": 0.6214093094149373, "grad_norm": 0.855847179889679, "learning_rate": 9.007963157438408e-06, "loss": 0.6841, "step": 5300 }, { "epoch": 0.6225817798100598, "grad_norm": 1.031787395477295, "learning_rate": 9.004275360811998e-06, "loss": 0.7875, "step": 5310 }, { "epoch": 0.6237542502051823, "grad_norm": 0.978463888168335, "learning_rate": 9.000581480227868e-06, "loss": 0.7174, "step": 5320 }, { "epoch": 0.6249267206003049, "grad_norm": 1.0045160055160522, "learning_rate": 8.996881521298374e-06, "loss": 0.7367, "step": 5330 }, { "epoch": 0.6260991909954273, "grad_norm": 1.1135567426681519, "learning_rate": 8.993175489645102e-06, "loss": 0.7016, "step": 5340 }, { "epoch": 0.6272716613905499, "grad_norm": 1.0286409854888916, "learning_rate": 8.989463390898871e-06, "loss": 0.6861, "step": 5350 }, { "epoch": 0.6284441317856724, "grad_norm": 0.9553893804550171, "learning_rate": 8.985745230699714e-06, "loss": 0.7624, "step": 5360 }, { "epoch": 0.629616602180795, "grad_norm": 1.0253043174743652, "learning_rate": 8.982021014696878e-06, "loss": 0.7564, "step": 5370 }, { "epoch": 0.6307890725759174, "grad_norm": 1.2260591983795166, "learning_rate": 8.978290748548805e-06, "loss": 0.7737, "step": 5380 }, { "epoch": 0.63196154297104, "grad_norm": 1.1754308938980103, "learning_rate": 8.974554437923135e-06, "loss": 0.7779, "step": 5390 }, { "epoch": 0.6331340133661625, "grad_norm": 1.060644507408142, "learning_rate": 8.970812088496688e-06, "loss": 0.778, "step": 5400 }, { "epoch": 0.6343064837612851, "grad_norm": 1.1664294004440308, "learning_rate": 8.96706370595546e-06, "loss": 0.7533, "step": 5410 }, { "epoch": 0.6354789541564075, "grad_norm": 1.0442417860031128, "learning_rate": 8.963309295994617e-06, "loss": 0.7659, "step": 5420 }, { "epoch": 0.6366514245515301, "grad_norm": 1.1139658689498901, "learning_rate": 8.959548864318477e-06, "loss": 0.751, "step": 5430 }, { "epoch": 0.6378238949466526, "grad_norm": 1.0231168270111084, "learning_rate": 8.95578241664051e-06, "loss": 0.6894, "step": 5440 }, { "epoch": 0.6389963653417752, "grad_norm": 1.064772605895996, "learning_rate": 8.952009958683328e-06, "loss": 0.7213, "step": 5450 }, { "epoch": 0.6401688357368976, "grad_norm": 0.9768255352973938, "learning_rate": 8.948231496178671e-06, "loss": 0.7419, "step": 5460 }, { "epoch": 0.6413413061320201, "grad_norm": 0.9219671487808228, "learning_rate": 8.944447034867409e-06, "loss": 0.744, "step": 5470 }, { "epoch": 0.6425137765271427, "grad_norm": 1.0457818508148193, "learning_rate": 8.940656580499518e-06, "loss": 0.7917, "step": 5480 }, { "epoch": 0.6436862469222652, "grad_norm": 1.0001921653747559, "learning_rate": 8.936860138834083e-06, "loss": 0.739, "step": 5490 }, { "epoch": 0.6448587173173878, "grad_norm": 1.194133996963501, "learning_rate": 8.93305771563929e-06, "loss": 0.7612, "step": 5500 }, { "epoch": 0.6448587173173878, "eval_loss": 0.7514063119888306, "eval_model_preparation_time": 0.0, "eval_runtime": 2152.4267, "eval_samples_per_second": 3.523, "eval_steps_per_second": 1.761, "step": 5500 }, { "epoch": 0.6460311877125102, "grad_norm": 1.1081827878952026, "learning_rate": 8.929249316692407e-06, "loss": 0.7344, "step": 5510 }, { "epoch": 0.6472036581076328, "grad_norm": 1.1251354217529297, "learning_rate": 8.925434947779784e-06, "loss": 0.76, "step": 5520 }, { "epoch": 0.6483761285027553, "grad_norm": 1.0125856399536133, "learning_rate": 8.92161461469684e-06, "loss": 0.7916, "step": 5530 }, { "epoch": 0.6495485988978779, "grad_norm": 1.092978835105896, "learning_rate": 8.917788323248063e-06, "loss": 0.7435, "step": 5540 }, { "epoch": 0.6507210692930003, "grad_norm": 1.0860575437545776, "learning_rate": 8.91395607924698e-06, "loss": 0.7862, "step": 5550 }, { "epoch": 0.6518935396881229, "grad_norm": 1.10507333278656, "learning_rate": 8.910117888516177e-06, "loss": 0.704, "step": 5560 }, { "epoch": 0.6530660100832454, "grad_norm": 1.025537371635437, "learning_rate": 8.906273756887263e-06, "loss": 0.7259, "step": 5570 }, { "epoch": 0.654238480478368, "grad_norm": 1.1253817081451416, "learning_rate": 8.902423690200883e-06, "loss": 0.7044, "step": 5580 }, { "epoch": 0.6554109508734904, "grad_norm": 1.01089346408844, "learning_rate": 8.898567694306694e-06, "loss": 0.6953, "step": 5590 }, { "epoch": 0.656583421268613, "grad_norm": 1.113648533821106, "learning_rate": 8.894705775063363e-06, "loss": 0.7321, "step": 5600 }, { "epoch": 0.6577558916637355, "grad_norm": 1.0831019878387451, "learning_rate": 8.890837938338556e-06, "loss": 0.7198, "step": 5610 }, { "epoch": 0.658928362058858, "grad_norm": 1.0872654914855957, "learning_rate": 8.886964190008933e-06, "loss": 0.7595, "step": 5620 }, { "epoch": 0.6601008324539805, "grad_norm": 1.1179250478744507, "learning_rate": 8.883084535960129e-06, "loss": 0.7907, "step": 5630 }, { "epoch": 0.661273302849103, "grad_norm": 0.9997557401657104, "learning_rate": 8.879198982086761e-06, "loss": 0.7625, "step": 5640 }, { "epoch": 0.6624457732442256, "grad_norm": 1.1151928901672363, "learning_rate": 8.875307534292403e-06, "loss": 0.7224, "step": 5650 }, { "epoch": 0.6636182436393481, "grad_norm": 1.0721313953399658, "learning_rate": 8.871410198489588e-06, "loss": 0.7617, "step": 5660 }, { "epoch": 0.6647907140344707, "grad_norm": 0.9067814946174622, "learning_rate": 8.86750698059979e-06, "loss": 0.7859, "step": 5670 }, { "epoch": 0.6659631844295931, "grad_norm": 1.0075325965881348, "learning_rate": 8.863597886553427e-06, "loss": 0.7659, "step": 5680 }, { "epoch": 0.6671356548247157, "grad_norm": 1.1112604141235352, "learning_rate": 8.859682922289843e-06, "loss": 0.622, "step": 5690 }, { "epoch": 0.6683081252198382, "grad_norm": 1.0401099920272827, "learning_rate": 8.855762093757293e-06, "loss": 0.7124, "step": 5700 }, { "epoch": 0.6694805956149608, "grad_norm": 1.0853816270828247, "learning_rate": 8.851835406912954e-06, "loss": 0.7914, "step": 5710 }, { "epoch": 0.6706530660100832, "grad_norm": 1.1077091693878174, "learning_rate": 8.847902867722897e-06, "loss": 0.7156, "step": 5720 }, { "epoch": 0.6718255364052058, "grad_norm": 1.0685842037200928, "learning_rate": 8.843964482162086e-06, "loss": 0.7291, "step": 5730 }, { "epoch": 0.6729980068003283, "grad_norm": 1.180147409439087, "learning_rate": 8.840020256214366e-06, "loss": 0.788, "step": 5740 }, { "epoch": 0.6741704771954509, "grad_norm": 1.496376395225525, "learning_rate": 8.836070195872462e-06, "loss": 0.7731, "step": 5750 }, { "epoch": 0.6753429475905733, "grad_norm": 1.075093388557434, "learning_rate": 8.832114307137958e-06, "loss": 0.7403, "step": 5760 }, { "epoch": 0.6765154179856958, "grad_norm": 1.072293758392334, "learning_rate": 8.828152596021291e-06, "loss": 0.814, "step": 5770 }, { "epoch": 0.6776878883808184, "grad_norm": 0.9755071401596069, "learning_rate": 8.824185068541754e-06, "loss": 0.7985, "step": 5780 }, { "epoch": 0.6788603587759409, "grad_norm": 1.0190602540969849, "learning_rate": 8.820211730727466e-06, "loss": 0.7301, "step": 5790 }, { "epoch": 0.6800328291710634, "grad_norm": 1.1887270212173462, "learning_rate": 8.816232588615382e-06, "loss": 0.706, "step": 5800 }, { "epoch": 0.6812052995661859, "grad_norm": 0.9869251847267151, "learning_rate": 8.812247648251272e-06, "loss": 0.744, "step": 5810 }, { "epoch": 0.6823777699613085, "grad_norm": 1.0370802879333496, "learning_rate": 8.808256915689719e-06, "loss": 0.7742, "step": 5820 }, { "epoch": 0.683550240356431, "grad_norm": 1.0817986726760864, "learning_rate": 8.804260396994102e-06, "loss": 0.7659, "step": 5830 }, { "epoch": 0.6847227107515536, "grad_norm": 1.157869577407837, "learning_rate": 8.800258098236594e-06, "loss": 0.7574, "step": 5840 }, { "epoch": 0.685895181146676, "grad_norm": 1.314330816268921, "learning_rate": 8.79625002549815e-06, "loss": 0.7455, "step": 5850 }, { "epoch": 0.6870676515417986, "grad_norm": 1.2318965196609497, "learning_rate": 8.792236184868495e-06, "loss": 0.7359, "step": 5860 }, { "epoch": 0.6882401219369211, "grad_norm": 1.034080982208252, "learning_rate": 8.788216582446125e-06, "loss": 0.7924, "step": 5870 }, { "epoch": 0.6894125923320437, "grad_norm": 0.9761860370635986, "learning_rate": 8.78419122433828e-06, "loss": 0.6943, "step": 5880 }, { "epoch": 0.6905850627271661, "grad_norm": 1.2477350234985352, "learning_rate": 8.780160116660952e-06, "loss": 0.7712, "step": 5890 }, { "epoch": 0.6917575331222887, "grad_norm": 1.1495983600616455, "learning_rate": 8.776123265538868e-06, "loss": 0.7216, "step": 5900 }, { "epoch": 0.6929300035174112, "grad_norm": 1.1491771936416626, "learning_rate": 8.772080677105479e-06, "loss": 0.6998, "step": 5910 }, { "epoch": 0.6941024739125337, "grad_norm": 1.0527609586715698, "learning_rate": 8.768032357502953e-06, "loss": 0.7889, "step": 5920 }, { "epoch": 0.6952749443076562, "grad_norm": 0.8945150971412659, "learning_rate": 8.76397831288217e-06, "loss": 0.7133, "step": 5930 }, { "epoch": 0.6964474147027787, "grad_norm": 0.982228696346283, "learning_rate": 8.759918549402703e-06, "loss": 0.7113, "step": 5940 }, { "epoch": 0.6976198850979013, "grad_norm": 1.0868371725082397, "learning_rate": 8.75585307323282e-06, "loss": 0.75, "step": 5950 }, { "epoch": 0.6987923554930238, "grad_norm": 1.0014392137527466, "learning_rate": 8.75178189054946e-06, "loss": 0.703, "step": 5960 }, { "epoch": 0.6999648258881463, "grad_norm": 1.2044895887374878, "learning_rate": 8.74770500753824e-06, "loss": 0.7444, "step": 5970 }, { "epoch": 0.7011372962832688, "grad_norm": 1.1593577861785889, "learning_rate": 8.743622430393435e-06, "loss": 0.7279, "step": 5980 }, { "epoch": 0.7023097666783914, "grad_norm": 1.1383157968521118, "learning_rate": 8.739534165317973e-06, "loss": 0.7955, "step": 5990 }, { "epoch": 0.7034822370735139, "grad_norm": 1.1744496822357178, "learning_rate": 8.735440218523418e-06, "loss": 0.7281, "step": 6000 }, { "epoch": 0.7034822370735139, "eval_loss": 0.7453180551528931, "eval_model_preparation_time": 0.0, "eval_runtime": 2147.3171, "eval_samples_per_second": 3.531, "eval_steps_per_second": 1.765, "step": 6000 }, { "epoch": 0.7046547074686365, "grad_norm": 1.3264436721801758, "learning_rate": 8.73134059622998e-06, "loss": 0.794, "step": 6010 }, { "epoch": 0.7058271778637589, "grad_norm": 1.092395305633545, "learning_rate": 8.727235304666476e-06, "loss": 0.7145, "step": 6020 }, { "epoch": 0.7069996482588815, "grad_norm": 1.147038221359253, "learning_rate": 8.723124350070347e-06, "loss": 0.7269, "step": 6030 }, { "epoch": 0.708172118654004, "grad_norm": 1.028171181678772, "learning_rate": 8.719007738687636e-06, "loss": 0.7244, "step": 6040 }, { "epoch": 0.7093445890491266, "grad_norm": 0.9882946014404297, "learning_rate": 8.714885476772978e-06, "loss": 0.8128, "step": 6050 }, { "epoch": 0.710517059444249, "grad_norm": 1.1070884466171265, "learning_rate": 8.710757570589598e-06, "loss": 0.7464, "step": 6060 }, { "epoch": 0.7116895298393715, "grad_norm": 1.0811313390731812, "learning_rate": 8.706624026409294e-06, "loss": 0.7222, "step": 6070 }, { "epoch": 0.7128620002344941, "grad_norm": 1.0947617292404175, "learning_rate": 8.70248485051243e-06, "loss": 0.7623, "step": 6080 }, { "epoch": 0.7140344706296166, "grad_norm": 0.9939017295837402, "learning_rate": 8.698340049187924e-06, "loss": 0.7652, "step": 6090 }, { "epoch": 0.7152069410247391, "grad_norm": 1.1304669380187988, "learning_rate": 8.694189628733247e-06, "loss": 0.7151, "step": 6100 }, { "epoch": 0.7163794114198616, "grad_norm": 1.1380646228790283, "learning_rate": 8.690033595454404e-06, "loss": 0.7578, "step": 6110 }, { "epoch": 0.7175518818149842, "grad_norm": 1.1376657485961914, "learning_rate": 8.685871955665927e-06, "loss": 0.746, "step": 6120 }, { "epoch": 0.7187243522101067, "grad_norm": 0.9364240765571594, "learning_rate": 8.681704715690871e-06, "loss": 0.7438, "step": 6130 }, { "epoch": 0.7198968226052292, "grad_norm": 1.1062755584716797, "learning_rate": 8.677531881860795e-06, "loss": 0.8278, "step": 6140 }, { "epoch": 0.7210692930003517, "grad_norm": 1.1436127424240112, "learning_rate": 8.673353460515756e-06, "loss": 0.7821, "step": 6150 }, { "epoch": 0.7222417633954743, "grad_norm": 0.9965627193450928, "learning_rate": 8.669169458004308e-06, "loss": 0.7436, "step": 6160 }, { "epoch": 0.7234142337905968, "grad_norm": 1.0522595643997192, "learning_rate": 8.664979880683476e-06, "loss": 0.8064, "step": 6170 }, { "epoch": 0.7245867041857194, "grad_norm": 1.0924321413040161, "learning_rate": 8.66078473491876e-06, "loss": 0.7912, "step": 6180 }, { "epoch": 0.7257591745808418, "grad_norm": 1.123354196548462, "learning_rate": 8.656584027084121e-06, "loss": 0.7534, "step": 6190 }, { "epoch": 0.7269316449759644, "grad_norm": 1.4195460081100464, "learning_rate": 8.652377763561968e-06, "loss": 0.7242, "step": 6200 }, { "epoch": 0.7281041153710869, "grad_norm": 2.4745702743530273, "learning_rate": 8.648165950743152e-06, "loss": 0.7277, "step": 6210 }, { "epoch": 0.7292765857662094, "grad_norm": 1.1638003587722778, "learning_rate": 8.643948595026959e-06, "loss": 0.7604, "step": 6220 }, { "epoch": 0.7304490561613319, "grad_norm": 1.4666173458099365, "learning_rate": 8.639725702821092e-06, "loss": 0.7723, "step": 6230 }, { "epoch": 0.7316215265564544, "grad_norm": 1.054829478263855, "learning_rate": 8.635497280541665e-06, "loss": 0.709, "step": 6240 }, { "epoch": 0.732793996951577, "grad_norm": 1.0644621849060059, "learning_rate": 8.6312633346132e-06, "loss": 0.7507, "step": 6250 }, { "epoch": 0.7339664673466995, "grad_norm": 0.9962634444236755, "learning_rate": 8.627023871468608e-06, "loss": 0.6816, "step": 6260 }, { "epoch": 0.735138937741822, "grad_norm": 1.032974362373352, "learning_rate": 8.622778897549182e-06, "loss": 0.664, "step": 6270 }, { "epoch": 0.7363114081369445, "grad_norm": 1.1572370529174805, "learning_rate": 8.61852841930459e-06, "loss": 0.7138, "step": 6280 }, { "epoch": 0.7374838785320671, "grad_norm": 1.0739747285842896, "learning_rate": 8.61427244319286e-06, "loss": 0.7853, "step": 6290 }, { "epoch": 0.7386563489271896, "grad_norm": 1.1930303573608398, "learning_rate": 8.610010975680377e-06, "loss": 0.7599, "step": 6300 }, { "epoch": 0.7398288193223121, "grad_norm": 1.1599684953689575, "learning_rate": 8.605744023241864e-06, "loss": 0.7259, "step": 6310 }, { "epoch": 0.7410012897174346, "grad_norm": 1.0961090326309204, "learning_rate": 8.601471592360387e-06, "loss": 0.7403, "step": 6320 }, { "epoch": 0.7421737601125572, "grad_norm": 1.2491276264190674, "learning_rate": 8.597193689527325e-06, "loss": 0.8182, "step": 6330 }, { "epoch": 0.7433462305076797, "grad_norm": 1.0940735340118408, "learning_rate": 8.592910321242378e-06, "loss": 0.7283, "step": 6340 }, { "epoch": 0.7445187009028023, "grad_norm": 1.1823140382766724, "learning_rate": 8.588621494013549e-06, "loss": 0.7303, "step": 6350 }, { "epoch": 0.7456911712979247, "grad_norm": 1.244482159614563, "learning_rate": 8.584327214357131e-06, "loss": 0.7778, "step": 6360 }, { "epoch": 0.7468636416930472, "grad_norm": 1.109314203262329, "learning_rate": 8.580027488797706e-06, "loss": 0.7908, "step": 6370 }, { "epoch": 0.7480361120881698, "grad_norm": 1.1809508800506592, "learning_rate": 8.57572232386813e-06, "loss": 0.7296, "step": 6380 }, { "epoch": 0.7492085824832923, "grad_norm": 0.9657132625579834, "learning_rate": 8.571411726109518e-06, "loss": 0.709, "step": 6390 }, { "epoch": 0.7503810528784148, "grad_norm": 1.045301914215088, "learning_rate": 8.567095702071247e-06, "loss": 0.7444, "step": 6400 }, { "epoch": 0.7515535232735373, "grad_norm": 1.0968214273452759, "learning_rate": 8.562774258310935e-06, "loss": 0.7361, "step": 6410 }, { "epoch": 0.7527259936686599, "grad_norm": 0.9877039194107056, "learning_rate": 8.558447401394432e-06, "loss": 0.6973, "step": 6420 }, { "epoch": 0.7538984640637824, "grad_norm": 1.079677939414978, "learning_rate": 8.554115137895815e-06, "loss": 0.7158, "step": 6430 }, { "epoch": 0.7550709344589049, "grad_norm": 1.1828813552856445, "learning_rate": 8.549777474397376e-06, "loss": 0.7096, "step": 6440 }, { "epoch": 0.7562434048540274, "grad_norm": 1.0783538818359375, "learning_rate": 8.545434417489615e-06, "loss": 0.79, "step": 6450 }, { "epoch": 0.75741587524915, "grad_norm": 1.1764426231384277, "learning_rate": 8.541085973771217e-06, "loss": 0.7115, "step": 6460 }, { "epoch": 0.7585883456442725, "grad_norm": 1.0958009958267212, "learning_rate": 8.536732149849061e-06, "loss": 0.7142, "step": 6470 }, { "epoch": 0.759760816039395, "grad_norm": 1.2626237869262695, "learning_rate": 8.532372952338194e-06, "loss": 0.7606, "step": 6480 }, { "epoch": 0.7609332864345175, "grad_norm": 1.7458686828613281, "learning_rate": 8.528008387861832e-06, "loss": 0.7058, "step": 6490 }, { "epoch": 0.7621057568296401, "grad_norm": 1.0504740476608276, "learning_rate": 8.523638463051343e-06, "loss": 0.7403, "step": 6500 }, { "epoch": 0.7621057568296401, "eval_loss": 0.739372968673706, "eval_model_preparation_time": 0.0, "eval_runtime": 2150.4667, "eval_samples_per_second": 3.526, "eval_steps_per_second": 1.763, "step": 6500 }, { "epoch": 0.7632782272247626, "grad_norm": 1.175437092781067, "learning_rate": 8.519263184546242e-06, "loss": 0.7455, "step": 6510 }, { "epoch": 0.764450697619885, "grad_norm": 1.0781993865966797, "learning_rate": 8.514882558994174e-06, "loss": 0.7201, "step": 6520 }, { "epoch": 0.7656231680150076, "grad_norm": 1.1888612508773804, "learning_rate": 8.510496593050907e-06, "loss": 0.7648, "step": 6530 }, { "epoch": 0.7667956384101301, "grad_norm": 1.0834509134292603, "learning_rate": 8.506105293380332e-06, "loss": 0.7343, "step": 6540 }, { "epoch": 0.7679681088052527, "grad_norm": 1.1017436981201172, "learning_rate": 8.501708666654436e-06, "loss": 0.7377, "step": 6550 }, { "epoch": 0.7691405792003752, "grad_norm": 1.130162239074707, "learning_rate": 8.4973067195533e-06, "loss": 0.7413, "step": 6560 }, { "epoch": 0.7703130495954977, "grad_norm": 1.0967962741851807, "learning_rate": 8.492899458765094e-06, "loss": 0.7711, "step": 6570 }, { "epoch": 0.7714855199906202, "grad_norm": 1.1959242820739746, "learning_rate": 8.488486890986055e-06, "loss": 0.7465, "step": 6580 }, { "epoch": 0.7726579903857428, "grad_norm": 1.1316769123077393, "learning_rate": 8.484069022920485e-06, "loss": 0.7616, "step": 6590 }, { "epoch": 0.7738304607808653, "grad_norm": 1.0951893329620361, "learning_rate": 8.479645861280745e-06, "loss": 0.6808, "step": 6600 }, { "epoch": 0.7750029311759878, "grad_norm": 1.0406503677368164, "learning_rate": 8.475217412787229e-06, "loss": 0.6862, "step": 6610 }, { "epoch": 0.7761754015711103, "grad_norm": 1.0960148572921753, "learning_rate": 8.470783684168371e-06, "loss": 0.7175, "step": 6620 }, { "epoch": 0.7773478719662329, "grad_norm": 0.9769788980484009, "learning_rate": 8.466344682160628e-06, "loss": 0.7541, "step": 6630 }, { "epoch": 0.7785203423613554, "grad_norm": 1.0977537631988525, "learning_rate": 8.46190041350846e-06, "loss": 0.7337, "step": 6640 }, { "epoch": 0.779692812756478, "grad_norm": 1.0886157751083374, "learning_rate": 8.45745088496434e-06, "loss": 0.7619, "step": 6650 }, { "epoch": 0.7808652831516004, "grad_norm": 1.0459182262420654, "learning_rate": 8.452996103288728e-06, "loss": 0.7457, "step": 6660 }, { "epoch": 0.7820377535467229, "grad_norm": 1.1482539176940918, "learning_rate": 8.448536075250064e-06, "loss": 0.7532, "step": 6670 }, { "epoch": 0.7832102239418455, "grad_norm": 1.0000706911087036, "learning_rate": 8.444070807624759e-06, "loss": 0.7133, "step": 6680 }, { "epoch": 0.784382694336968, "grad_norm": 1.2803868055343628, "learning_rate": 8.43960030719719e-06, "loss": 0.7501, "step": 6690 }, { "epoch": 0.7855551647320905, "grad_norm": 1.2655792236328125, "learning_rate": 8.435124580759679e-06, "loss": 0.7478, "step": 6700 }, { "epoch": 0.786727635127213, "grad_norm": 1.075177788734436, "learning_rate": 8.43064363511249e-06, "loss": 0.7374, "step": 6710 }, { "epoch": 0.7879001055223356, "grad_norm": 1.0643033981323242, "learning_rate": 8.42615747706382e-06, "loss": 0.7678, "step": 6720 }, { "epoch": 0.789072575917458, "grad_norm": 1.277093768119812, "learning_rate": 8.421666113429776e-06, "loss": 0.7059, "step": 6730 }, { "epoch": 0.7902450463125806, "grad_norm": 0.9960077404975891, "learning_rate": 8.417169551034389e-06, "loss": 0.7585, "step": 6740 }, { "epoch": 0.7914175167077031, "grad_norm": 1.1498967409133911, "learning_rate": 8.412667796709574e-06, "loss": 0.7439, "step": 6750 }, { "epoch": 0.7925899871028257, "grad_norm": 1.119436264038086, "learning_rate": 8.408160857295147e-06, "loss": 0.7539, "step": 6760 }, { "epoch": 0.7937624574979482, "grad_norm": 1.189998745918274, "learning_rate": 8.40364873963879e-06, "loss": 0.7374, "step": 6770 }, { "epoch": 0.7949349278930707, "grad_norm": 1.0506916046142578, "learning_rate": 8.399131450596063e-06, "loss": 0.7672, "step": 6780 }, { "epoch": 0.7961073982881932, "grad_norm": 1.1543880701065063, "learning_rate": 8.394608997030378e-06, "loss": 0.7136, "step": 6790 }, { "epoch": 0.7972798686833158, "grad_norm": 1.0325825214385986, "learning_rate": 8.390081385812993e-06, "loss": 0.7721, "step": 6800 }, { "epoch": 0.7984523390784383, "grad_norm": 1.074489712715149, "learning_rate": 8.385548623823007e-06, "loss": 0.7259, "step": 6810 }, { "epoch": 0.7996248094735607, "grad_norm": 1.2747024297714233, "learning_rate": 8.381010717947339e-06, "loss": 0.7011, "step": 6820 }, { "epoch": 0.8007972798686833, "grad_norm": 1.199389100074768, "learning_rate": 8.376467675080732e-06, "loss": 0.774, "step": 6830 }, { "epoch": 0.8019697502638058, "grad_norm": 1.1827434301376343, "learning_rate": 8.371919502125721e-06, "loss": 0.7089, "step": 6840 }, { "epoch": 0.8031422206589284, "grad_norm": 1.1291314363479614, "learning_rate": 8.367366205992649e-06, "loss": 0.7478, "step": 6850 }, { "epoch": 0.8043146910540508, "grad_norm": 1.185679316520691, "learning_rate": 8.362807793599634e-06, "loss": 0.7491, "step": 6860 }, { "epoch": 0.8054871614491734, "grad_norm": 1.222344160079956, "learning_rate": 8.35824427187257e-06, "loss": 0.7333, "step": 6870 }, { "epoch": 0.8066596318442959, "grad_norm": 1.0451596975326538, "learning_rate": 8.353675647745115e-06, "loss": 0.7072, "step": 6880 }, { "epoch": 0.8078321022394185, "grad_norm": 1.1318479776382446, "learning_rate": 8.34910192815868e-06, "loss": 0.811, "step": 6890 }, { "epoch": 0.809004572634541, "grad_norm": 1.148289680480957, "learning_rate": 8.344523120062415e-06, "loss": 0.6903, "step": 6900 }, { "epoch": 0.8101770430296635, "grad_norm": 1.2069355249404907, "learning_rate": 8.339939230413204e-06, "loss": 0.7878, "step": 6910 }, { "epoch": 0.811349513424786, "grad_norm": 1.1785677671432495, "learning_rate": 8.33535026617565e-06, "loss": 0.7644, "step": 6920 }, { "epoch": 0.8125219838199086, "grad_norm": 1.0032308101654053, "learning_rate": 8.330756234322068e-06, "loss": 0.7469, "step": 6930 }, { "epoch": 0.8136944542150311, "grad_norm": 1.1309210062026978, "learning_rate": 8.32615714183247e-06, "loss": 0.7198, "step": 6940 }, { "epoch": 0.8148669246101536, "grad_norm": 1.0625702142715454, "learning_rate": 8.321552995694558e-06, "loss": 0.7471, "step": 6950 }, { "epoch": 0.8160393950052761, "grad_norm": 1.0470114946365356, "learning_rate": 8.316943802903714e-06, "loss": 0.7762, "step": 6960 }, { "epoch": 0.8172118654003986, "grad_norm": 0.9930213093757629, "learning_rate": 8.312329570462986e-06, "loss": 0.7025, "step": 6970 }, { "epoch": 0.8183843357955212, "grad_norm": 3.252241611480713, "learning_rate": 8.307710305383078e-06, "loss": 0.7254, "step": 6980 }, { "epoch": 0.8195568061906436, "grad_norm": 0.9109893441200256, "learning_rate": 8.303086014682344e-06, "loss": 0.6865, "step": 6990 }, { "epoch": 0.8207292765857662, "grad_norm": 1.0866971015930176, "learning_rate": 8.29845670538677e-06, "loss": 0.7925, "step": 7000 }, { "epoch": 0.8207292765857662, "eval_loss": 0.734103798866272, "eval_model_preparation_time": 0.0, "eval_runtime": 2148.8377, "eval_samples_per_second": 3.528, "eval_steps_per_second": 1.764, "step": 7000 }, { "epoch": 0.8219017469808887, "grad_norm": 1.018980860710144, "learning_rate": 8.293822384529973e-06, "loss": 0.6941, "step": 7010 }, { "epoch": 0.8230742173760113, "grad_norm": 1.0894968509674072, "learning_rate": 8.289183059153171e-06, "loss": 0.7516, "step": 7020 }, { "epoch": 0.8242466877711337, "grad_norm": 1.029783844947815, "learning_rate": 8.284538736305203e-06, "loss": 0.724, "step": 7030 }, { "epoch": 0.8254191581662563, "grad_norm": 1.1236716508865356, "learning_rate": 8.279889423042491e-06, "loss": 0.7049, "step": 7040 }, { "epoch": 0.8265916285613788, "grad_norm": 1.2002010345458984, "learning_rate": 8.27523512642904e-06, "loss": 0.711, "step": 7050 }, { "epoch": 0.8277640989565014, "grad_norm": 1.2382994890213013, "learning_rate": 8.270575853536427e-06, "loss": 0.7077, "step": 7060 }, { "epoch": 0.8289365693516239, "grad_norm": 1.2816288471221924, "learning_rate": 8.26591161144379e-06, "loss": 0.7491, "step": 7070 }, { "epoch": 0.8301090397467464, "grad_norm": 1.0516186952590942, "learning_rate": 8.26124240723782e-06, "loss": 0.7098, "step": 7080 }, { "epoch": 0.8312815101418689, "grad_norm": 1.0401452779769897, "learning_rate": 8.256568248012742e-06, "loss": 0.7066, "step": 7090 }, { "epoch": 0.8324539805369915, "grad_norm": 1.1412122249603271, "learning_rate": 8.251889140870313e-06, "loss": 0.7626, "step": 7100 }, { "epoch": 0.833626450932114, "grad_norm": 1.295230746269226, "learning_rate": 8.247205092919807e-06, "loss": 0.7151, "step": 7110 }, { "epoch": 0.8347989213272365, "grad_norm": 1.2383297681808472, "learning_rate": 8.242516111278006e-06, "loss": 0.7788, "step": 7120 }, { "epoch": 0.835971391722359, "grad_norm": 1.230687141418457, "learning_rate": 8.237822203069186e-06, "loss": 0.725, "step": 7130 }, { "epoch": 0.8371438621174815, "grad_norm": 1.1574335098266602, "learning_rate": 8.233123375425108e-06, "loss": 0.7152, "step": 7140 }, { "epoch": 0.8383163325126041, "grad_norm": 1.2551031112670898, "learning_rate": 8.228419635485008e-06, "loss": 0.6923, "step": 7150 }, { "epoch": 0.8394888029077265, "grad_norm": 1.150881052017212, "learning_rate": 8.223710990395587e-06, "loss": 0.7808, "step": 7160 }, { "epoch": 0.8406612733028491, "grad_norm": 1.0657992362976074, "learning_rate": 8.218997447310998e-06, "loss": 0.7293, "step": 7170 }, { "epoch": 0.8418337436979716, "grad_norm": 0.977594256401062, "learning_rate": 8.214279013392836e-06, "loss": 0.6746, "step": 7180 }, { "epoch": 0.8430062140930942, "grad_norm": 1.0696020126342773, "learning_rate": 8.209555695810128e-06, "loss": 0.6883, "step": 7190 }, { "epoch": 0.8441786844882166, "grad_norm": 1.0936360359191895, "learning_rate": 8.204827501739314e-06, "loss": 0.7462, "step": 7200 }, { "epoch": 0.8453511548833392, "grad_norm": 0.9869905114173889, "learning_rate": 8.200094438364255e-06, "loss": 0.7324, "step": 7210 }, { "epoch": 0.8465236252784617, "grad_norm": 1.03476881980896, "learning_rate": 8.1953565128762e-06, "loss": 0.698, "step": 7220 }, { "epoch": 0.8476960956735843, "grad_norm": 1.0198603868484497, "learning_rate": 8.190613732473794e-06, "loss": 0.7228, "step": 7230 }, { "epoch": 0.8488685660687068, "grad_norm": 1.1576170921325684, "learning_rate": 8.185866104363049e-06, "loss": 0.6609, "step": 7240 }, { "epoch": 0.8500410364638293, "grad_norm": 1.4077986478805542, "learning_rate": 8.181113635757347e-06, "loss": 0.7847, "step": 7250 }, { "epoch": 0.8512135068589518, "grad_norm": 1.1107035875320435, "learning_rate": 8.176356333877428e-06, "loss": 0.7246, "step": 7260 }, { "epoch": 0.8523859772540744, "grad_norm": 0.9906008243560791, "learning_rate": 8.171594205951374e-06, "loss": 0.7487, "step": 7270 }, { "epoch": 0.8535584476491969, "grad_norm": 1.0066276788711548, "learning_rate": 8.166827259214591e-06, "loss": 0.7405, "step": 7280 }, { "epoch": 0.8547309180443193, "grad_norm": 1.1469391584396362, "learning_rate": 8.162055500909818e-06, "loss": 0.7211, "step": 7290 }, { "epoch": 0.8559033884394419, "grad_norm": 1.0381579399108887, "learning_rate": 8.157278938287099e-06, "loss": 0.7204, "step": 7300 }, { "epoch": 0.8570758588345644, "grad_norm": 1.5525798797607422, "learning_rate": 8.152497578603777e-06, "loss": 0.7661, "step": 7310 }, { "epoch": 0.858248329229687, "grad_norm": 1.1993991136550903, "learning_rate": 8.147711429124487e-06, "loss": 0.7678, "step": 7320 }, { "epoch": 0.8594207996248094, "grad_norm": 1.384130597114563, "learning_rate": 8.142920497121139e-06, "loss": 0.7013, "step": 7330 }, { "epoch": 0.860593270019932, "grad_norm": 1.2548624277114868, "learning_rate": 8.138124789872907e-06, "loss": 0.7514, "step": 7340 }, { "epoch": 0.8617657404150545, "grad_norm": 1.227177381515503, "learning_rate": 8.133324314666224e-06, "loss": 0.7084, "step": 7350 }, { "epoch": 0.8629382108101771, "grad_norm": 1.0578882694244385, "learning_rate": 8.12851907879477e-06, "loss": 0.7379, "step": 7360 }, { "epoch": 0.8641106812052995, "grad_norm": 1.1096757650375366, "learning_rate": 8.123709089559451e-06, "loss": 0.7186, "step": 7370 }, { "epoch": 0.8652831516004221, "grad_norm": 1.349973201751709, "learning_rate": 8.118894354268398e-06, "loss": 0.7536, "step": 7380 }, { "epoch": 0.8664556219955446, "grad_norm": 1.2341082096099854, "learning_rate": 8.114074880236954e-06, "loss": 0.7151, "step": 7390 }, { "epoch": 0.8676280923906672, "grad_norm": 1.0859607458114624, "learning_rate": 8.109250674787663e-06, "loss": 0.7399, "step": 7400 }, { "epoch": 0.8688005627857897, "grad_norm": 1.083253026008606, "learning_rate": 8.104421745250255e-06, "loss": 0.7322, "step": 7410 }, { "epoch": 0.8699730331809122, "grad_norm": 0.9877168536186218, "learning_rate": 8.09958809896164e-06, "loss": 0.6913, "step": 7420 }, { "epoch": 0.8711455035760347, "grad_norm": 1.1507173776626587, "learning_rate": 8.09474974326589e-06, "loss": 0.7319, "step": 7430 }, { "epoch": 0.8723179739711572, "grad_norm": 1.257414698600769, "learning_rate": 8.089906685514239e-06, "loss": 0.7099, "step": 7440 }, { "epoch": 0.8734904443662798, "grad_norm": 1.0389037132263184, "learning_rate": 8.085058933065062e-06, "loss": 0.6792, "step": 7450 }, { "epoch": 0.8746629147614022, "grad_norm": 1.1381036043167114, "learning_rate": 8.080206493283864e-06, "loss": 0.7119, "step": 7460 }, { "epoch": 0.8758353851565248, "grad_norm": 1.0032236576080322, "learning_rate": 8.075349373543277e-06, "loss": 0.6909, "step": 7470 }, { "epoch": 0.8770078555516473, "grad_norm": 0.8912198543548584, "learning_rate": 8.070487581223039e-06, "loss": 0.7212, "step": 7480 }, { "epoch": 0.8781803259467699, "grad_norm": 1.2873060703277588, "learning_rate": 8.06562112370999e-06, "loss": 0.7624, "step": 7490 }, { "epoch": 0.8793527963418923, "grad_norm": 1.0321297645568848, "learning_rate": 8.060750008398058e-06, "loss": 0.7464, "step": 7500 }, { "epoch": 0.8793527963418923, "eval_loss": 0.7283608913421631, "eval_model_preparation_time": 0.0, "eval_runtime": 2144.4321, "eval_samples_per_second": 3.536, "eval_steps_per_second": 1.768, "step": 7500 }, { "epoch": 0.8805252667370149, "grad_norm": 1.1973700523376465, "learning_rate": 8.055874242688247e-06, "loss": 0.7162, "step": 7510 }, { "epoch": 0.8816977371321374, "grad_norm": 1.066755771636963, "learning_rate": 8.050993833988627e-06, "loss": 0.7484, "step": 7520 }, { "epoch": 0.88287020752726, "grad_norm": 1.069413423538208, "learning_rate": 8.046108789714322e-06, "loss": 0.7097, "step": 7530 }, { "epoch": 0.8840426779223824, "grad_norm": 0.9917722940444946, "learning_rate": 8.041219117287502e-06, "loss": 0.755, "step": 7540 }, { "epoch": 0.885215148317505, "grad_norm": 1.0033432245254517, "learning_rate": 8.036324824137362e-06, "loss": 0.7646, "step": 7550 }, { "epoch": 0.8863876187126275, "grad_norm": 1.100006103515625, "learning_rate": 8.03142591770013e-06, "loss": 0.7101, "step": 7560 }, { "epoch": 0.8875600891077501, "grad_norm": 1.2458500862121582, "learning_rate": 8.026522405419024e-06, "loss": 0.7349, "step": 7570 }, { "epoch": 0.8887325595028726, "grad_norm": 1.0773968696594238, "learning_rate": 8.02161429474428e-06, "loss": 0.6723, "step": 7580 }, { "epoch": 0.889905029897995, "grad_norm": 1.1850491762161255, "learning_rate": 8.016701593133109e-06, "loss": 0.7556, "step": 7590 }, { "epoch": 0.8910775002931176, "grad_norm": 1.20777428150177, "learning_rate": 8.0117843080497e-06, "loss": 0.7242, "step": 7600 }, { "epoch": 0.8922499706882401, "grad_norm": 0.9026963114738464, "learning_rate": 8.006862446965207e-06, "loss": 0.7118, "step": 7610 }, { "epoch": 0.8934224410833627, "grad_norm": 1.0720374584197998, "learning_rate": 8.001936017357733e-06, "loss": 0.7205, "step": 7620 }, { "epoch": 0.8945949114784851, "grad_norm": 1.039357304573059, "learning_rate": 7.99700502671233e-06, "loss": 0.6762, "step": 7630 }, { "epoch": 0.8957673818736077, "grad_norm": 1.2924586534500122, "learning_rate": 7.992069482520971e-06, "loss": 0.6686, "step": 7640 }, { "epoch": 0.8969398522687302, "grad_norm": 1.1106328964233398, "learning_rate": 7.987129392282555e-06, "loss": 0.6993, "step": 7650 }, { "epoch": 0.8981123226638528, "grad_norm": 1.0650882720947266, "learning_rate": 7.982184763502879e-06, "loss": 0.6971, "step": 7660 }, { "epoch": 0.8992847930589752, "grad_norm": 1.1755831241607666, "learning_rate": 7.977235603694647e-06, "loss": 0.7819, "step": 7670 }, { "epoch": 0.9004572634540978, "grad_norm": 1.075526475906372, "learning_rate": 7.972281920377437e-06, "loss": 0.676, "step": 7680 }, { "epoch": 0.9016297338492203, "grad_norm": 1.2197345495224, "learning_rate": 7.967323721077707e-06, "loss": 0.7864, "step": 7690 }, { "epoch": 0.9028022042443429, "grad_norm": 0.984853982925415, "learning_rate": 7.962361013328775e-06, "loss": 0.7179, "step": 7700 }, { "epoch": 0.9039746746394653, "grad_norm": 1.002349615097046, "learning_rate": 7.957393804670802e-06, "loss": 0.6901, "step": 7710 }, { "epoch": 0.9051471450345879, "grad_norm": 1.1555958986282349, "learning_rate": 7.9524221026508e-06, "loss": 0.7044, "step": 7720 }, { "epoch": 0.9063196154297104, "grad_norm": 1.4410991668701172, "learning_rate": 7.947445914822598e-06, "loss": 0.8071, "step": 7730 }, { "epoch": 0.9074920858248329, "grad_norm": 1.353641390800476, "learning_rate": 7.942465248746844e-06, "loss": 0.73, "step": 7740 }, { "epoch": 0.9086645562199555, "grad_norm": 1.0242420434951782, "learning_rate": 7.937480111990991e-06, "loss": 0.7407, "step": 7750 }, { "epoch": 0.9098370266150779, "grad_norm": 1.1515642404556274, "learning_rate": 7.932490512129285e-06, "loss": 0.7392, "step": 7760 }, { "epoch": 0.9110094970102005, "grad_norm": 1.1500046253204346, "learning_rate": 7.92749645674275e-06, "loss": 0.6988, "step": 7770 }, { "epoch": 0.912181967405323, "grad_norm": 0.9559960961341858, "learning_rate": 7.922497953419178e-06, "loss": 0.7196, "step": 7780 }, { "epoch": 0.9133544378004456, "grad_norm": 1.1232435703277588, "learning_rate": 7.917495009753132e-06, "loss": 0.6798, "step": 7790 }, { "epoch": 0.914526908195568, "grad_norm": 1.2863346338272095, "learning_rate": 7.9124876333459e-06, "loss": 0.7588, "step": 7800 }, { "epoch": 0.9156993785906906, "grad_norm": 0.964526891708374, "learning_rate": 7.907475831805524e-06, "loss": 0.7357, "step": 7810 }, { "epoch": 0.9168718489858131, "grad_norm": 1.1943424940109253, "learning_rate": 7.902459612746762e-06, "loss": 0.7235, "step": 7820 }, { "epoch": 0.9180443193809357, "grad_norm": 1.0786712169647217, "learning_rate": 7.897438983791081e-06, "loss": 0.6647, "step": 7830 }, { "epoch": 0.9192167897760581, "grad_norm": 0.9419664144515991, "learning_rate": 7.892413952566654e-06, "loss": 0.7053, "step": 7840 }, { "epoch": 0.9203892601711807, "grad_norm": 1.067457914352417, "learning_rate": 7.887384526708339e-06, "loss": 0.76, "step": 7850 }, { "epoch": 0.9215617305663032, "grad_norm": 1.0117429494857788, "learning_rate": 7.882350713857671e-06, "loss": 0.731, "step": 7860 }, { "epoch": 0.9227342009614258, "grad_norm": 1.037227988243103, "learning_rate": 7.877312521662852e-06, "loss": 0.7392, "step": 7870 }, { "epoch": 0.9239066713565482, "grad_norm": 1.0221651792526245, "learning_rate": 7.872269957778739e-06, "loss": 0.7888, "step": 7880 }, { "epoch": 0.9250791417516707, "grad_norm": 1.2074806690216064, "learning_rate": 7.867223029866827e-06, "loss": 0.7636, "step": 7890 }, { "epoch": 0.9262516121467933, "grad_norm": 1.0669277906417847, "learning_rate": 7.862171745595244e-06, "loss": 0.6901, "step": 7900 }, { "epoch": 0.9274240825419158, "grad_norm": 1.0118695497512817, "learning_rate": 7.857116112638741e-06, "loss": 0.6588, "step": 7910 }, { "epoch": 0.9285965529370384, "grad_norm": 1.0990993976593018, "learning_rate": 7.852056138678667e-06, "loss": 0.7237, "step": 7920 }, { "epoch": 0.9297690233321608, "grad_norm": 1.0507409572601318, "learning_rate": 7.846991831402975e-06, "loss": 0.739, "step": 7930 }, { "epoch": 0.9309414937272834, "grad_norm": 1.2201539278030396, "learning_rate": 7.841923198506196e-06, "loss": 0.6735, "step": 7940 }, { "epoch": 0.9321139641224059, "grad_norm": 1.1401511430740356, "learning_rate": 7.836850247689438e-06, "loss": 0.75, "step": 7950 }, { "epoch": 0.9332864345175285, "grad_norm": 1.105885624885559, "learning_rate": 7.831772986660366e-06, "loss": 0.7046, "step": 7960 }, { "epoch": 0.9344589049126509, "grad_norm": 1.0955404043197632, "learning_rate": 7.826691423133197e-06, "loss": 0.7481, "step": 7970 }, { "epoch": 0.9356313753077735, "grad_norm": 1.119033694267273, "learning_rate": 7.821605564828681e-06, "loss": 0.7298, "step": 7980 }, { "epoch": 0.936803845702896, "grad_norm": 1.1985660791397095, "learning_rate": 7.816515419474095e-06, "loss": 0.6959, "step": 7990 }, { "epoch": 0.9379763160980186, "grad_norm": 1.2488900423049927, "learning_rate": 7.811420994803235e-06, "loss": 0.7594, "step": 8000 }, { "epoch": 0.9379763160980186, "eval_loss": 0.7228077054023743, "eval_model_preparation_time": 0.0, "eval_runtime": 2144.7663, "eval_samples_per_second": 3.535, "eval_steps_per_second": 1.768, "step": 8000 }, { "epoch": 0.939148786493141, "grad_norm": 1.3526009321212769, "learning_rate": 7.806322298556387e-06, "loss": 0.7443, "step": 8010 }, { "epoch": 0.9403212568882636, "grad_norm": 1.5920106172561646, "learning_rate": 7.801219338480338e-06, "loss": 0.7391, "step": 8020 }, { "epoch": 0.9414937272833861, "grad_norm": 1.2067146301269531, "learning_rate": 7.79611212232835e-06, "loss": 0.7041, "step": 8030 }, { "epoch": 0.9426661976785086, "grad_norm": 1.1234793663024902, "learning_rate": 7.791000657860148e-06, "loss": 0.7528, "step": 8040 }, { "epoch": 0.9438386680736311, "grad_norm": 1.033103108406067, "learning_rate": 7.785884952841918e-06, "loss": 0.6775, "step": 8050 }, { "epoch": 0.9450111384687536, "grad_norm": 1.1658815145492554, "learning_rate": 7.780765015046281e-06, "loss": 0.7247, "step": 8060 }, { "epoch": 0.9461836088638762, "grad_norm": 1.0392357110977173, "learning_rate": 7.775640852252298e-06, "loss": 0.7472, "step": 8070 }, { "epoch": 0.9473560792589987, "grad_norm": 1.2836512327194214, "learning_rate": 7.770512472245445e-06, "loss": 0.7358, "step": 8080 }, { "epoch": 0.9485285496541213, "grad_norm": 1.048248291015625, "learning_rate": 7.765379882817604e-06, "loss": 0.7456, "step": 8090 }, { "epoch": 0.9497010200492437, "grad_norm": 1.1926636695861816, "learning_rate": 7.76024309176705e-06, "loss": 0.7143, "step": 8100 }, { "epoch": 0.9508734904443663, "grad_norm": 1.228818655014038, "learning_rate": 7.755102106898455e-06, "loss": 0.7173, "step": 8110 }, { "epoch": 0.9520459608394888, "grad_norm": 1.1739641427993774, "learning_rate": 7.749956936022847e-06, "loss": 0.7298, "step": 8120 }, { "epoch": 0.9532184312346114, "grad_norm": 1.0658782720565796, "learning_rate": 7.744807586957622e-06, "loss": 0.7074, "step": 8130 }, { "epoch": 0.9543909016297338, "grad_norm": 1.3338888883590698, "learning_rate": 7.739654067526526e-06, "loss": 0.7619, "step": 8140 }, { "epoch": 0.9555633720248564, "grad_norm": 1.1174508333206177, "learning_rate": 7.734496385559634e-06, "loss": 0.7384, "step": 8150 }, { "epoch": 0.9567358424199789, "grad_norm": 1.3348311185836792, "learning_rate": 7.729334548893354e-06, "loss": 0.729, "step": 8160 }, { "epoch": 0.9579083128151015, "grad_norm": 1.2442865371704102, "learning_rate": 7.724168565370403e-06, "loss": 0.6961, "step": 8170 }, { "epoch": 0.9590807832102239, "grad_norm": 1.3968628644943237, "learning_rate": 7.718998442839794e-06, "loss": 0.7455, "step": 8180 }, { "epoch": 0.9602532536053464, "grad_norm": 1.108886480331421, "learning_rate": 7.713824189156833e-06, "loss": 0.6927, "step": 8190 }, { "epoch": 0.961425724000469, "grad_norm": 1.0728874206542969, "learning_rate": 7.708645812183107e-06, "loss": 0.716, "step": 8200 }, { "epoch": 0.9625981943955915, "grad_norm": 1.1022753715515137, "learning_rate": 7.703463319786458e-06, "loss": 0.7291, "step": 8210 }, { "epoch": 0.963770664790714, "grad_norm": 1.1198627948760986, "learning_rate": 7.698276719840989e-06, "loss": 0.7561, "step": 8220 }, { "epoch": 0.9649431351858365, "grad_norm": 1.120421051979065, "learning_rate": 7.69308602022704e-06, "loss": 0.676, "step": 8230 }, { "epoch": 0.9661156055809591, "grad_norm": 0.988211452960968, "learning_rate": 7.687891228831179e-06, "loss": 0.7451, "step": 8240 }, { "epoch": 0.9672880759760816, "grad_norm": 1.2442445755004883, "learning_rate": 7.682692353546197e-06, "loss": 0.7409, "step": 8250 }, { "epoch": 0.9684605463712042, "grad_norm": 1.1342089176177979, "learning_rate": 7.677489402271082e-06, "loss": 0.76, "step": 8260 }, { "epoch": 0.9696330167663266, "grad_norm": 1.3257917165756226, "learning_rate": 7.672282382911019e-06, "loss": 0.6718, "step": 8270 }, { "epoch": 0.9708054871614492, "grad_norm": 1.3876031637191772, "learning_rate": 7.667071303377375e-06, "loss": 0.68, "step": 8280 }, { "epoch": 0.9719779575565717, "grad_norm": 1.1228652000427246, "learning_rate": 7.661856171587682e-06, "loss": 0.6708, "step": 8290 }, { "epoch": 0.9731504279516943, "grad_norm": 1.0944052934646606, "learning_rate": 7.656636995465634e-06, "loss": 0.7121, "step": 8300 }, { "epoch": 0.9743228983468167, "grad_norm": 1.6437556743621826, "learning_rate": 7.651413782941066e-06, "loss": 0.7487, "step": 8310 }, { "epoch": 0.9754953687419393, "grad_norm": 1.1308956146240234, "learning_rate": 7.646186541949945e-06, "loss": 0.6921, "step": 8320 }, { "epoch": 0.9766678391370618, "grad_norm": 1.3445934057235718, "learning_rate": 7.64095528043436e-06, "loss": 0.7536, "step": 8330 }, { "epoch": 0.9778403095321843, "grad_norm": 0.9232298135757446, "learning_rate": 7.635720006342513e-06, "loss": 0.6871, "step": 8340 }, { "epoch": 0.9790127799273068, "grad_norm": 0.9935089945793152, "learning_rate": 7.630480727628696e-06, "loss": 0.6874, "step": 8350 }, { "epoch": 0.9801852503224293, "grad_norm": 1.3799282312393188, "learning_rate": 7.625237452253288e-06, "loss": 0.6678, "step": 8360 }, { "epoch": 0.9813577207175519, "grad_norm": 1.1510642766952515, "learning_rate": 7.619990188182742e-06, "loss": 0.7543, "step": 8370 }, { "epoch": 0.9825301911126744, "grad_norm": 1.1997884511947632, "learning_rate": 7.614738943389569e-06, "loss": 0.7182, "step": 8380 }, { "epoch": 0.983702661507797, "grad_norm": 1.2337881326675415, "learning_rate": 7.609483725852329e-06, "loss": 0.6618, "step": 8390 }, { "epoch": 0.9848751319029194, "grad_norm": 1.2841765880584717, "learning_rate": 7.604224543555619e-06, "loss": 0.7693, "step": 8400 }, { "epoch": 0.986047602298042, "grad_norm": 1.1200889348983765, "learning_rate": 7.598961404490056e-06, "loss": 0.7208, "step": 8410 }, { "epoch": 0.9872200726931645, "grad_norm": 1.1618973016738892, "learning_rate": 7.593694316652275e-06, "loss": 0.687, "step": 8420 }, { "epoch": 0.9883925430882871, "grad_norm": 1.3280342817306519, "learning_rate": 7.588423288044906e-06, "loss": 0.7198, "step": 8430 }, { "epoch": 0.9895650134834095, "grad_norm": 1.2371927499771118, "learning_rate": 7.583148326676569e-06, "loss": 0.7027, "step": 8440 }, { "epoch": 0.9907374838785321, "grad_norm": 1.2025104761123657, "learning_rate": 7.577869440561856e-06, "loss": 0.7538, "step": 8450 }, { "epoch": 0.9919099542736546, "grad_norm": 1.2035717964172363, "learning_rate": 7.572586637721327e-06, "loss": 0.7037, "step": 8460 }, { "epoch": 0.9930824246687772, "grad_norm": 1.3028098344802856, "learning_rate": 7.567299926181488e-06, "loss": 0.7138, "step": 8470 }, { "epoch": 0.9942548950638996, "grad_norm": 1.289879322052002, "learning_rate": 7.5620093139747875e-06, "loss": 0.7528, "step": 8480 }, { "epoch": 0.9954273654590221, "grad_norm": 1.1823586225509644, "learning_rate": 7.5567148091395985e-06, "loss": 0.7258, "step": 8490 }, { "epoch": 0.9965998358541447, "grad_norm": 1.1352040767669678, "learning_rate": 7.551416419720208e-06, "loss": 0.7046, "step": 8500 }, { "epoch": 0.9965998358541447, "eval_loss": 0.7177925109863281, "eval_model_preparation_time": 0.0, "eval_runtime": 2147.9394, "eval_samples_per_second": 3.53, "eval_steps_per_second": 1.765, "step": 8500 }, { "epoch": 0.9977723062492672, "grad_norm": 1.0639647245407104, "learning_rate": 7.546114153766806e-06, "loss": 0.7153, "step": 8510 }, { "epoch": 0.9989447766443897, "grad_norm": 1.068735957145691, "learning_rate": 7.540808019335475e-06, "loss": 0.7072, "step": 8520 }, { "epoch": 1.0001172470395123, "grad_norm": 0.9937713146209717, "learning_rate": 7.5354980244881685e-06, "loss": 0.7305, "step": 8530 }, { "epoch": 1.0012897174346347, "grad_norm": 1.516403317451477, "learning_rate": 7.530184177292712e-06, "loss": 0.4814, "step": 8540 }, { "epoch": 1.0024621878297573, "grad_norm": 1.263933777809143, "learning_rate": 7.524866485822779e-06, "loss": 0.5644, "step": 8550 }, { "epoch": 1.0036346582248798, "grad_norm": 1.260459303855896, "learning_rate": 7.519544958157889e-06, "loss": 0.5487, "step": 8560 }, { "epoch": 1.0048071286200024, "grad_norm": 1.2482556104660034, "learning_rate": 7.514219602383388e-06, "loss": 0.5275, "step": 8570 }, { "epoch": 1.0059795990151248, "grad_norm": 1.3634369373321533, "learning_rate": 7.508890426590434e-06, "loss": 0.4966, "step": 8580 }, { "epoch": 1.0071520694102474, "grad_norm": 1.0997904539108276, "learning_rate": 7.503557438875994e-06, "loss": 0.5226, "step": 8590 }, { "epoch": 1.00832453980537, "grad_norm": 1.20207679271698, "learning_rate": 7.498220647342829e-06, "loss": 0.5068, "step": 8600 }, { "epoch": 1.0094970102004925, "grad_norm": 1.096439003944397, "learning_rate": 7.49288006009947e-06, "loss": 0.5316, "step": 8610 }, { "epoch": 1.010669480595615, "grad_norm": 1.1844561100006104, "learning_rate": 7.487535685260225e-06, "loss": 0.573, "step": 8620 }, { "epoch": 1.0118419509907375, "grad_norm": 1.2831557989120483, "learning_rate": 7.482187530945151e-06, "loss": 0.5887, "step": 8630 }, { "epoch": 1.01301442138586, "grad_norm": 1.145118236541748, "learning_rate": 7.476835605280051e-06, "loss": 0.5133, "step": 8640 }, { "epoch": 1.0141868917809824, "grad_norm": 1.2374606132507324, "learning_rate": 7.471479916396451e-06, "loss": 0.5661, "step": 8650 }, { "epoch": 1.015359362176105, "grad_norm": 1.363843560218811, "learning_rate": 7.466120472431606e-06, "loss": 0.5303, "step": 8660 }, { "epoch": 1.0165318325712276, "grad_norm": 1.1260038614273071, "learning_rate": 7.460757281528466e-06, "loss": 0.5323, "step": 8670 }, { "epoch": 1.0177043029663502, "grad_norm": 1.0837939977645874, "learning_rate": 7.4553903518356785e-06, "loss": 0.5092, "step": 8680 }, { "epoch": 1.0188767733614725, "grad_norm": 1.3386188745498657, "learning_rate": 7.450019691507574e-06, "loss": 0.5419, "step": 8690 }, { "epoch": 1.0200492437565951, "grad_norm": 1.3514246940612793, "learning_rate": 7.444645308704145e-06, "loss": 0.5214, "step": 8700 }, { "epoch": 1.0212217141517177, "grad_norm": 1.1440465450286865, "learning_rate": 7.439267211591045e-06, "loss": 0.5446, "step": 8710 }, { "epoch": 1.0223941845468403, "grad_norm": 1.1199647188186646, "learning_rate": 7.433885408339568e-06, "loss": 0.4955, "step": 8720 }, { "epoch": 1.0235666549419626, "grad_norm": 1.1451033353805542, "learning_rate": 7.428499907126641e-06, "loss": 0.5541, "step": 8730 }, { "epoch": 1.0247391253370852, "grad_norm": 1.4245214462280273, "learning_rate": 7.423110716134808e-06, "loss": 0.5606, "step": 8740 }, { "epoch": 1.0259115957322078, "grad_norm": 1.4119879007339478, "learning_rate": 7.417717843552221e-06, "loss": 0.5668, "step": 8750 }, { "epoch": 1.0270840661273304, "grad_norm": 1.0931345224380493, "learning_rate": 7.412321297572621e-06, "loss": 0.4903, "step": 8760 }, { "epoch": 1.0282565365224527, "grad_norm": 1.2480442523956299, "learning_rate": 7.406921086395338e-06, "loss": 0.5352, "step": 8770 }, { "epoch": 1.0294290069175753, "grad_norm": 1.2247905731201172, "learning_rate": 7.401517218225264e-06, "loss": 0.5073, "step": 8780 }, { "epoch": 1.030601477312698, "grad_norm": 1.6661467552185059, "learning_rate": 7.3961097012728485e-06, "loss": 0.5356, "step": 8790 }, { "epoch": 1.0317739477078203, "grad_norm": 1.468617558479309, "learning_rate": 7.3906985437540894e-06, "loss": 0.4799, "step": 8800 }, { "epoch": 1.0329464181029429, "grad_norm": 1.3147664070129395, "learning_rate": 7.3852837538905095e-06, "loss": 0.5114, "step": 8810 }, { "epoch": 1.0341188884980654, "grad_norm": 1.3915245532989502, "learning_rate": 7.379865339909156e-06, "loss": 0.4872, "step": 8820 }, { "epoch": 1.035291358893188, "grad_norm": 1.111879587173462, "learning_rate": 7.3744433100425785e-06, "loss": 0.5131, "step": 8830 }, { "epoch": 1.0364638292883104, "grad_norm": 1.1699422597885132, "learning_rate": 7.369017672528821e-06, "loss": 0.4961, "step": 8840 }, { "epoch": 1.037636299683433, "grad_norm": 1.618423581123352, "learning_rate": 7.363588435611412e-06, "loss": 0.4998, "step": 8850 }, { "epoch": 1.0388087700785555, "grad_norm": 1.240455150604248, "learning_rate": 7.3581556075393445e-06, "loss": 0.5075, "step": 8860 }, { "epoch": 1.0399812404736781, "grad_norm": 1.3238359689712524, "learning_rate": 7.352719196567073e-06, "loss": 0.5333, "step": 8870 }, { "epoch": 1.0411537108688005, "grad_norm": 1.2630120515823364, "learning_rate": 7.34727921095449e-06, "loss": 0.5211, "step": 8880 }, { "epoch": 1.042326181263923, "grad_norm": 1.3064016103744507, "learning_rate": 7.341835658966921e-06, "loss": 0.5125, "step": 8890 }, { "epoch": 1.0434986516590457, "grad_norm": 1.3647042512893677, "learning_rate": 7.336388548875116e-06, "loss": 0.5321, "step": 8900 }, { "epoch": 1.0446711220541682, "grad_norm": 1.2841987609863281, "learning_rate": 7.33093788895522e-06, "loss": 0.5574, "step": 8910 }, { "epoch": 1.0458435924492906, "grad_norm": 1.4742164611816406, "learning_rate": 7.325483687488779e-06, "loss": 0.5626, "step": 8920 }, { "epoch": 1.0470160628444132, "grad_norm": 1.1455574035644531, "learning_rate": 7.3200259527627205e-06, "loss": 0.5696, "step": 8930 }, { "epoch": 1.0481885332395358, "grad_norm": 1.0511691570281982, "learning_rate": 7.314564693069337e-06, "loss": 0.4706, "step": 8940 }, { "epoch": 1.0493610036346581, "grad_norm": 1.4680390357971191, "learning_rate": 7.309099916706277e-06, "loss": 0.5463, "step": 8950 }, { "epoch": 1.0505334740297807, "grad_norm": 1.3190709352493286, "learning_rate": 7.303631631976536e-06, "loss": 0.5034, "step": 8960 }, { "epoch": 1.0517059444249033, "grad_norm": 1.2022627592086792, "learning_rate": 7.298159847188433e-06, "loss": 0.5274, "step": 8970 }, { "epoch": 1.0528784148200259, "grad_norm": 1.3852818012237549, "learning_rate": 7.292684570655611e-06, "loss": 0.5068, "step": 8980 }, { "epoch": 1.0540508852151482, "grad_norm": 1.2620248794555664, "learning_rate": 7.287205810697016e-06, "loss": 0.5088, "step": 8990 }, { "epoch": 1.0552233556102708, "grad_norm": 1.281164526939392, "learning_rate": 7.281723575636887e-06, "loss": 0.4908, "step": 9000 }, { "epoch": 1.0552233556102708, "eval_loss": 0.7292585372924805, "eval_model_preparation_time": 0.0, "eval_runtime": 2146.7747, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.766, "step": 9000 }, { "epoch": 1.0563958260053934, "grad_norm": 1.1545464992523193, "learning_rate": 7.276237873804743e-06, "loss": 0.5324, "step": 9010 }, { "epoch": 1.057568296400516, "grad_norm": 1.190037727355957, "learning_rate": 7.270748713535367e-06, "loss": 0.5027, "step": 9020 }, { "epoch": 1.0587407667956383, "grad_norm": 1.1737440824508667, "learning_rate": 7.265256103168803e-06, "loss": 0.5292, "step": 9030 }, { "epoch": 1.059913237190761, "grad_norm": 1.420189619064331, "learning_rate": 7.259760051050333e-06, "loss": 0.5216, "step": 9040 }, { "epoch": 1.0610857075858835, "grad_norm": 1.3930319547653198, "learning_rate": 7.254260565530466e-06, "loss": 0.5092, "step": 9050 }, { "epoch": 1.062258177981006, "grad_norm": 1.388270616531372, "learning_rate": 7.248757654964934e-06, "loss": 0.5786, "step": 9060 }, { "epoch": 1.0634306483761284, "grad_norm": 1.185646414756775, "learning_rate": 7.243251327714669e-06, "loss": 0.5123, "step": 9070 }, { "epoch": 1.064603118771251, "grad_norm": 1.1558237075805664, "learning_rate": 7.237741592145791e-06, "loss": 0.4957, "step": 9080 }, { "epoch": 1.0657755891663736, "grad_norm": 1.257686734199524, "learning_rate": 7.2322284566296045e-06, "loss": 0.5181, "step": 9090 }, { "epoch": 1.0669480595614962, "grad_norm": 1.4422234296798706, "learning_rate": 7.226711929542579e-06, "loss": 0.5382, "step": 9100 }, { "epoch": 1.0681205299566185, "grad_norm": 1.2248705625534058, "learning_rate": 7.221192019266332e-06, "loss": 0.546, "step": 9110 }, { "epoch": 1.0692930003517411, "grad_norm": 1.1548466682434082, "learning_rate": 7.215668734187625e-06, "loss": 0.5539, "step": 9120 }, { "epoch": 1.0704654707468637, "grad_norm": 1.3112505674362183, "learning_rate": 7.2101420826983505e-06, "loss": 0.5386, "step": 9130 }, { "epoch": 1.071637941141986, "grad_norm": 1.7181991338729858, "learning_rate": 7.2046120731955076e-06, "loss": 0.5443, "step": 9140 }, { "epoch": 1.0728104115371087, "grad_norm": 1.1641637086868286, "learning_rate": 7.1990787140812016e-06, "loss": 0.5109, "step": 9150 }, { "epoch": 1.0739828819322312, "grad_norm": 1.5014824867248535, "learning_rate": 7.19354201376263e-06, "loss": 0.5155, "step": 9160 }, { "epoch": 1.0751553523273538, "grad_norm": 1.1973650455474854, "learning_rate": 7.1880019806520625e-06, "loss": 0.4984, "step": 9170 }, { "epoch": 1.0763278227224762, "grad_norm": 1.3758183717727661, "learning_rate": 7.182458623166835e-06, "loss": 0.5356, "step": 9180 }, { "epoch": 1.0775002931175988, "grad_norm": 1.34469735622406, "learning_rate": 7.176911949729334e-06, "loss": 0.5297, "step": 9190 }, { "epoch": 1.0786727635127213, "grad_norm": 1.3068010807037354, "learning_rate": 7.171361968766981e-06, "loss": 0.5333, "step": 9200 }, { "epoch": 1.079845233907844, "grad_norm": 1.2618913650512695, "learning_rate": 7.165808688712228e-06, "loss": 0.5226, "step": 9210 }, { "epoch": 1.0810177043029663, "grad_norm": 1.4660327434539795, "learning_rate": 7.160252118002535e-06, "loss": 0.5417, "step": 9220 }, { "epoch": 1.0821901746980889, "grad_norm": 1.3869171142578125, "learning_rate": 7.154692265080366e-06, "loss": 0.5439, "step": 9230 }, { "epoch": 1.0833626450932115, "grad_norm": 1.0917081832885742, "learning_rate": 7.149129138393167e-06, "loss": 0.4531, "step": 9240 }, { "epoch": 1.0845351154883338, "grad_norm": 1.3025816679000854, "learning_rate": 7.1435627463933645e-06, "loss": 0.5119, "step": 9250 }, { "epoch": 1.0857075858834564, "grad_norm": 1.23401939868927, "learning_rate": 7.137993097538337e-06, "loss": 0.5386, "step": 9260 }, { "epoch": 1.086880056278579, "grad_norm": 1.1529276371002197, "learning_rate": 7.13242020029042e-06, "loss": 0.4912, "step": 9270 }, { "epoch": 1.0880525266737016, "grad_norm": 1.4131760597229004, "learning_rate": 7.126844063116881e-06, "loss": 0.5372, "step": 9280 }, { "epoch": 1.089224997068824, "grad_norm": 1.0875740051269531, "learning_rate": 7.1212646944899086e-06, "loss": 0.5073, "step": 9290 }, { "epoch": 1.0903974674639465, "grad_norm": 1.0204997062683105, "learning_rate": 7.115682102886606e-06, "loss": 0.5103, "step": 9300 }, { "epoch": 1.091569937859069, "grad_norm": 1.133583426475525, "learning_rate": 7.110096296788966e-06, "loss": 0.5039, "step": 9310 }, { "epoch": 1.0927424082541917, "grad_norm": 1.3910170793533325, "learning_rate": 7.1045072846838725e-06, "loss": 0.5652, "step": 9320 }, { "epoch": 1.093914878649314, "grad_norm": 1.6118055582046509, "learning_rate": 7.098915075063077e-06, "loss": 0.5433, "step": 9330 }, { "epoch": 1.0950873490444366, "grad_norm": 1.0546247959136963, "learning_rate": 7.09331967642319e-06, "loss": 0.5043, "step": 9340 }, { "epoch": 1.0962598194395592, "grad_norm": 1.5048503875732422, "learning_rate": 7.087721097265664e-06, "loss": 0.5615, "step": 9350 }, { "epoch": 1.0974322898346818, "grad_norm": 1.2682642936706543, "learning_rate": 7.082119346096792e-06, "loss": 0.5479, "step": 9360 }, { "epoch": 1.0986047602298041, "grad_norm": 1.2750043869018555, "learning_rate": 7.0765144314276765e-06, "loss": 0.5079, "step": 9370 }, { "epoch": 1.0997772306249267, "grad_norm": 1.1179468631744385, "learning_rate": 7.070906361774234e-06, "loss": 0.5082, "step": 9380 }, { "epoch": 1.1009497010200493, "grad_norm": 1.3642414808273315, "learning_rate": 7.065295145657172e-06, "loss": 0.5468, "step": 9390 }, { "epoch": 1.1021221714151719, "grad_norm": 1.2606351375579834, "learning_rate": 7.059680791601976e-06, "loss": 0.5031, "step": 9400 }, { "epoch": 1.1032946418102942, "grad_norm": 1.3101752996444702, "learning_rate": 7.054063308138904e-06, "loss": 0.5257, "step": 9410 }, { "epoch": 1.1044671122054168, "grad_norm": 1.4228172302246094, "learning_rate": 7.048442703802967e-06, "loss": 0.5552, "step": 9420 }, { "epoch": 1.1056395826005394, "grad_norm": 1.249943733215332, "learning_rate": 7.042818987133915e-06, "loss": 0.4907, "step": 9430 }, { "epoch": 1.1068120529956618, "grad_norm": 1.2742209434509277, "learning_rate": 7.03719216667623e-06, "loss": 0.501, "step": 9440 }, { "epoch": 1.1079845233907843, "grad_norm": 1.2385624647140503, "learning_rate": 7.031562250979109e-06, "loss": 0.521, "step": 9450 }, { "epoch": 1.109156993785907, "grad_norm": 1.2256947755813599, "learning_rate": 7.025929248596451e-06, "loss": 0.5371, "step": 9460 }, { "epoch": 1.1103294641810295, "grad_norm": 1.2062854766845703, "learning_rate": 7.020293168086846e-06, "loss": 0.4634, "step": 9470 }, { "epoch": 1.1115019345761519, "grad_norm": 1.2667367458343506, "learning_rate": 7.014654018013559e-06, "loss": 0.491, "step": 9480 }, { "epoch": 1.1126744049712745, "grad_norm": 1.4581176042556763, "learning_rate": 7.009011806944521e-06, "loss": 0.5397, "step": 9490 }, { "epoch": 1.113846875366397, "grad_norm": 1.1719108819961548, "learning_rate": 7.003366543452311e-06, "loss": 0.5079, "step": 9500 }, { "epoch": 1.113846875366397, "eval_loss": 0.7282720804214478, "eval_model_preparation_time": 0.0, "eval_runtime": 2149.2961, "eval_samples_per_second": 3.528, "eval_steps_per_second": 1.764, "step": 9500 }, { "epoch": 1.1150193457615196, "grad_norm": 1.3787137269973755, "learning_rate": 6.997718236114151e-06, "loss": 0.5415, "step": 9510 }, { "epoch": 1.116191816156642, "grad_norm": 1.2169462442398071, "learning_rate": 6.992066893511879e-06, "loss": 0.5038, "step": 9520 }, { "epoch": 1.1173642865517646, "grad_norm": 1.300452470779419, "learning_rate": 6.986412524231954e-06, "loss": 0.5686, "step": 9530 }, { "epoch": 1.1185367569468871, "grad_norm": 1.3124239444732666, "learning_rate": 6.980755136865428e-06, "loss": 0.5273, "step": 9540 }, { "epoch": 1.1197092273420095, "grad_norm": 1.4239732027053833, "learning_rate": 6.9750947400079396e-06, "loss": 0.5109, "step": 9550 }, { "epoch": 1.120881697737132, "grad_norm": 1.314141035079956, "learning_rate": 6.9694313422597e-06, "loss": 0.4784, "step": 9560 }, { "epoch": 1.1220541681322547, "grad_norm": 1.052672028541565, "learning_rate": 6.963764952225483e-06, "loss": 0.5096, "step": 9570 }, { "epoch": 1.1232266385273773, "grad_norm": 1.342708706855774, "learning_rate": 6.958095578514602e-06, "loss": 0.5196, "step": 9580 }, { "epoch": 1.1243991089224996, "grad_norm": 1.590558409690857, "learning_rate": 6.952423229740911e-06, "loss": 0.5094, "step": 9590 }, { "epoch": 1.1255715793176222, "grad_norm": 1.1796523332595825, "learning_rate": 6.94674791452278e-06, "loss": 0.4935, "step": 9600 }, { "epoch": 1.1267440497127448, "grad_norm": 1.2248693704605103, "learning_rate": 6.941069641483085e-06, "loss": 0.5034, "step": 9610 }, { "epoch": 1.1279165201078674, "grad_norm": 1.1683714389801025, "learning_rate": 6.9353884192492e-06, "loss": 0.4477, "step": 9620 }, { "epoch": 1.1290889905029897, "grad_norm": 1.1640831232070923, "learning_rate": 6.929704256452977e-06, "loss": 0.5672, "step": 9630 }, { "epoch": 1.1302614608981123, "grad_norm": 1.255991816520691, "learning_rate": 6.924017161730736e-06, "loss": 0.493, "step": 9640 }, { "epoch": 1.1314339312932349, "grad_norm": 1.2042920589447021, "learning_rate": 6.918327143723255e-06, "loss": 0.5101, "step": 9650 }, { "epoch": 1.1326064016883572, "grad_norm": 1.3028888702392578, "learning_rate": 6.91263421107575e-06, "loss": 0.5095, "step": 9660 }, { "epoch": 1.1337788720834798, "grad_norm": 1.3290109634399414, "learning_rate": 6.906938372437863e-06, "loss": 0.4882, "step": 9670 }, { "epoch": 1.1349513424786024, "grad_norm": 1.2149738073349, "learning_rate": 6.901239636463659e-06, "loss": 0.5283, "step": 9680 }, { "epoch": 1.136123812873725, "grad_norm": 1.1153661012649536, "learning_rate": 6.895538011811599e-06, "loss": 0.4844, "step": 9690 }, { "epoch": 1.1372962832688476, "grad_norm": 1.4468998908996582, "learning_rate": 6.889833507144534e-06, "loss": 0.5208, "step": 9700 }, { "epoch": 1.13846875366397, "grad_norm": 1.152814507484436, "learning_rate": 6.884126131129692e-06, "loss": 0.5415, "step": 9710 }, { "epoch": 1.1396412240590925, "grad_norm": 1.326251745223999, "learning_rate": 6.878415892438662e-06, "loss": 0.491, "step": 9720 }, { "epoch": 1.140813694454215, "grad_norm": 1.2469528913497925, "learning_rate": 6.872702799747384e-06, "loss": 0.4742, "step": 9730 }, { "epoch": 1.1419861648493375, "grad_norm": 1.2661417722702026, "learning_rate": 6.866986861736135e-06, "loss": 0.5124, "step": 9740 }, { "epoch": 1.14315863524446, "grad_norm": 1.196169376373291, "learning_rate": 6.861268087089514e-06, "loss": 0.5358, "step": 9750 }, { "epoch": 1.1443311056395826, "grad_norm": 1.3802281618118286, "learning_rate": 6.855546484496427e-06, "loss": 0.4947, "step": 9760 }, { "epoch": 1.1455035760347052, "grad_norm": 1.2869724035263062, "learning_rate": 6.849822062650082e-06, "loss": 0.5566, "step": 9770 }, { "epoch": 1.1466760464298276, "grad_norm": 1.1336451768875122, "learning_rate": 6.844094830247967e-06, "loss": 0.5422, "step": 9780 }, { "epoch": 1.1478485168249501, "grad_norm": 1.2551791667938232, "learning_rate": 6.838364795991841e-06, "loss": 0.5434, "step": 9790 }, { "epoch": 1.1490209872200727, "grad_norm": 1.7827317714691162, "learning_rate": 6.832631968587721e-06, "loss": 0.5073, "step": 9800 }, { "epoch": 1.1501934576151953, "grad_norm": 1.3491982221603394, "learning_rate": 6.826896356745865e-06, "loss": 0.5342, "step": 9810 }, { "epoch": 1.1513659280103177, "grad_norm": 1.1349127292633057, "learning_rate": 6.8211579691807665e-06, "loss": 0.4865, "step": 9820 }, { "epoch": 1.1525383984054403, "grad_norm": 1.3833693265914917, "learning_rate": 6.815416814611131e-06, "loss": 0.5334, "step": 9830 }, { "epoch": 1.1537108688005628, "grad_norm": 1.2096424102783203, "learning_rate": 6.809672901759872e-06, "loss": 0.5077, "step": 9840 }, { "epoch": 1.1548833391956852, "grad_norm": 1.3103666305541992, "learning_rate": 6.803926239354091e-06, "loss": 0.5741, "step": 9850 }, { "epoch": 1.1560558095908078, "grad_norm": 1.3449252843856812, "learning_rate": 6.798176836125071e-06, "loss": 0.5294, "step": 9860 }, { "epoch": 1.1572282799859304, "grad_norm": 1.273551344871521, "learning_rate": 6.792424700808253e-06, "loss": 0.5419, "step": 9870 }, { "epoch": 1.158400750381053, "grad_norm": 1.2460317611694336, "learning_rate": 6.786669842143236e-06, "loss": 0.5147, "step": 9880 }, { "epoch": 1.1595732207761753, "grad_norm": 1.5269173383712769, "learning_rate": 6.780912268873752e-06, "loss": 0.503, "step": 9890 }, { "epoch": 1.1607456911712979, "grad_norm": 1.3493280410766602, "learning_rate": 6.775151989747659e-06, "loss": 0.5355, "step": 9900 }, { "epoch": 1.1619181615664205, "grad_norm": 1.2978034019470215, "learning_rate": 6.769389013516927e-06, "loss": 0.5198, "step": 9910 }, { "epoch": 1.163090631961543, "grad_norm": 1.1335362195968628, "learning_rate": 6.7636233489376235e-06, "loss": 0.512, "step": 9920 }, { "epoch": 1.1642631023566654, "grad_norm": 1.5701240301132202, "learning_rate": 6.757855004769898e-06, "loss": 0.476, "step": 9930 }, { "epoch": 1.165435572751788, "grad_norm": 1.3240197896957397, "learning_rate": 6.7520839897779756e-06, "loss": 0.5737, "step": 9940 }, { "epoch": 1.1666080431469106, "grad_norm": 1.096248984336853, "learning_rate": 6.746310312730136e-06, "loss": 0.4946, "step": 9950 }, { "epoch": 1.1677805135420332, "grad_norm": 1.2123881578445435, "learning_rate": 6.740533982398705e-06, "loss": 0.4649, "step": 9960 }, { "epoch": 1.1689529839371555, "grad_norm": 1.1349283456802368, "learning_rate": 6.734755007560039e-06, "loss": 0.5306, "step": 9970 }, { "epoch": 1.170125454332278, "grad_norm": 1.2556180953979492, "learning_rate": 6.728973396994515e-06, "loss": 0.4849, "step": 9980 }, { "epoch": 1.1712979247274007, "grad_norm": 1.584905982017517, "learning_rate": 6.72318915948651e-06, "loss": 0.5363, "step": 9990 }, { "epoch": 1.1724703951225233, "grad_norm": 1.3625404834747314, "learning_rate": 6.717402303824395e-06, "loss": 0.5299, "step": 10000 }, { "epoch": 1.1724703951225233, "eval_loss": 0.7230923771858215, "eval_model_preparation_time": 0.0, "eval_runtime": 2147.213, "eval_samples_per_second": 3.531, "eval_steps_per_second": 1.766, "step": 10000 }, { "epoch": 1.1736428655176456, "grad_norm": 1.2899248600006104, "learning_rate": 6.7116128388005194e-06, "loss": 0.5116, "step": 10010 }, { "epoch": 1.1748153359127682, "grad_norm": 1.334876537322998, "learning_rate": 6.705820773211198e-06, "loss": 0.5504, "step": 10020 }, { "epoch": 1.1759878063078908, "grad_norm": 1.1905639171600342, "learning_rate": 6.700026115856693e-06, "loss": 0.4852, "step": 10030 }, { "epoch": 1.1771602767030132, "grad_norm": 1.2982544898986816, "learning_rate": 6.694228875541206e-06, "loss": 0.4497, "step": 10040 }, { "epoch": 1.1783327470981357, "grad_norm": 1.3602935075759888, "learning_rate": 6.6884290610728665e-06, "loss": 0.4824, "step": 10050 }, { "epoch": 1.1795052174932583, "grad_norm": 1.4988796710968018, "learning_rate": 6.682626681263711e-06, "loss": 0.5329, "step": 10060 }, { "epoch": 1.180677687888381, "grad_norm": 1.465499758720398, "learning_rate": 6.676821744929673e-06, "loss": 0.4901, "step": 10070 }, { "epoch": 1.1818501582835033, "grad_norm": 1.2817213535308838, "learning_rate": 6.671014260890577e-06, "loss": 0.5052, "step": 10080 }, { "epoch": 1.1830226286786258, "grad_norm": 1.2806850671768188, "learning_rate": 6.6652042379701095e-06, "loss": 0.5402, "step": 10090 }, { "epoch": 1.1841950990737484, "grad_norm": 1.379896640777588, "learning_rate": 6.65939168499582e-06, "loss": 0.5196, "step": 10100 }, { "epoch": 1.185367569468871, "grad_norm": 1.5992019176483154, "learning_rate": 6.653576610799102e-06, "loss": 0.525, "step": 10110 }, { "epoch": 1.1865400398639934, "grad_norm": 1.2851693630218506, "learning_rate": 6.647759024215178e-06, "loss": 0.4795, "step": 10120 }, { "epoch": 1.187712510259116, "grad_norm": 1.3098911046981812, "learning_rate": 6.641938934083085e-06, "loss": 0.5063, "step": 10130 }, { "epoch": 1.1888849806542385, "grad_norm": 1.3521735668182373, "learning_rate": 6.636116349245672e-06, "loss": 0.5149, "step": 10140 }, { "epoch": 1.190057451049361, "grad_norm": 1.1990143060684204, "learning_rate": 6.630291278549572e-06, "loss": 0.4731, "step": 10150 }, { "epoch": 1.1912299214444835, "grad_norm": 1.3492681980133057, "learning_rate": 6.624463730845192e-06, "loss": 0.4863, "step": 10160 }, { "epoch": 1.192402391839606, "grad_norm": 1.4486918449401855, "learning_rate": 6.618633714986712e-06, "loss": 0.5046, "step": 10170 }, { "epoch": 1.1935748622347286, "grad_norm": 1.268239140510559, "learning_rate": 6.612801239832056e-06, "loss": 0.5025, "step": 10180 }, { "epoch": 1.194747332629851, "grad_norm": 1.4430545568466187, "learning_rate": 6.606966314242882e-06, "loss": 0.4961, "step": 10190 }, { "epoch": 1.1959198030249736, "grad_norm": 1.3396106958389282, "learning_rate": 6.6011289470845765e-06, "loss": 0.4826, "step": 10200 }, { "epoch": 1.1970922734200962, "grad_norm": 1.3400495052337646, "learning_rate": 6.595289147226236e-06, "loss": 0.5358, "step": 10210 }, { "epoch": 1.1982647438152187, "grad_norm": 1.593146562576294, "learning_rate": 6.589446923540648e-06, "loss": 0.5431, "step": 10220 }, { "epoch": 1.199437214210341, "grad_norm": 1.3235979080200195, "learning_rate": 6.583602284904283e-06, "loss": 0.5448, "step": 10230 }, { "epoch": 1.2006096846054637, "grad_norm": 1.3439985513687134, "learning_rate": 6.57775524019729e-06, "loss": 0.4881, "step": 10240 }, { "epoch": 1.2017821550005863, "grad_norm": 1.3184165954589844, "learning_rate": 6.571905798303462e-06, "loss": 0.5353, "step": 10250 }, { "epoch": 1.2029546253957089, "grad_norm": 1.5100394487380981, "learning_rate": 6.56605396811024e-06, "loss": 0.5338, "step": 10260 }, { "epoch": 1.2041270957908312, "grad_norm": 1.3220638036727905, "learning_rate": 6.56019975850869e-06, "loss": 0.4971, "step": 10270 }, { "epoch": 1.2052995661859538, "grad_norm": 1.562718152999878, "learning_rate": 6.554343178393502e-06, "loss": 0.4874, "step": 10280 }, { "epoch": 1.2064720365810764, "grad_norm": 1.2883028984069824, "learning_rate": 6.5484842366629575e-06, "loss": 0.5357, "step": 10290 }, { "epoch": 1.207644506976199, "grad_norm": 1.4356145858764648, "learning_rate": 6.54262294221893e-06, "loss": 0.5698, "step": 10300 }, { "epoch": 1.2088169773713213, "grad_norm": 1.193841576576233, "learning_rate": 6.53675930396687e-06, "loss": 0.5124, "step": 10310 }, { "epoch": 1.209989447766444, "grad_norm": 1.3113006353378296, "learning_rate": 6.530893330815785e-06, "loss": 0.512, "step": 10320 }, { "epoch": 1.2111619181615665, "grad_norm": 1.3491748571395874, "learning_rate": 6.525025031678234e-06, "loss": 0.5216, "step": 10330 }, { "epoch": 1.2123343885566888, "grad_norm": 1.2184332609176636, "learning_rate": 6.519154415470305e-06, "loss": 0.4698, "step": 10340 }, { "epoch": 1.2135068589518114, "grad_norm": 1.2878497838974, "learning_rate": 6.513281491111615e-06, "loss": 0.4993, "step": 10350 }, { "epoch": 1.214679329346934, "grad_norm": 1.2888309955596924, "learning_rate": 6.507406267525276e-06, "loss": 0.5247, "step": 10360 }, { "epoch": 1.2158517997420566, "grad_norm": 1.2348136901855469, "learning_rate": 6.501528753637905e-06, "loss": 0.4771, "step": 10370 }, { "epoch": 1.217024270137179, "grad_norm": 1.3685132265090942, "learning_rate": 6.495648958379591e-06, "loss": 0.5468, "step": 10380 }, { "epoch": 1.2181967405323015, "grad_norm": 1.0698797702789307, "learning_rate": 6.489766890683892e-06, "loss": 0.5087, "step": 10390 }, { "epoch": 1.2193692109274241, "grad_norm": 1.3101304769515991, "learning_rate": 6.48388255948782e-06, "loss": 0.5462, "step": 10400 }, { "epoch": 1.2205416813225467, "grad_norm": 1.5180844068527222, "learning_rate": 6.477995973731823e-06, "loss": 0.5803, "step": 10410 }, { "epoch": 1.221714151717669, "grad_norm": 1.4092459678649902, "learning_rate": 6.472107142359777e-06, "loss": 0.5469, "step": 10420 }, { "epoch": 1.2228866221127916, "grad_norm": 1.418844223022461, "learning_rate": 6.4662160743189686e-06, "loss": 0.478, "step": 10430 }, { "epoch": 1.2240590925079142, "grad_norm": 1.3443547487258911, "learning_rate": 6.460322778560085e-06, "loss": 0.5465, "step": 10440 }, { "epoch": 1.2252315629030366, "grad_norm": 1.1514142751693726, "learning_rate": 6.4544272640371926e-06, "loss": 0.4895, "step": 10450 }, { "epoch": 1.2264040332981592, "grad_norm": 1.331505537033081, "learning_rate": 6.448529539707735e-06, "loss": 0.5403, "step": 10460 }, { "epoch": 1.2275765036932818, "grad_norm": 1.2548574209213257, "learning_rate": 6.442629614532513e-06, "loss": 0.5479, "step": 10470 }, { "epoch": 1.2287489740884043, "grad_norm": 1.3529539108276367, "learning_rate": 6.436727497475666e-06, "loss": 0.4976, "step": 10480 }, { "epoch": 1.229921444483527, "grad_norm": 1.3992316722869873, "learning_rate": 6.430823197504666e-06, "loss": 0.5021, "step": 10490 }, { "epoch": 1.2310939148786493, "grad_norm": 1.1513546705245972, "learning_rate": 6.4249167235903065e-06, "loss": 0.5105, "step": 10500 }, { "epoch": 1.2310939148786493, "eval_loss": 0.7217609882354736, "eval_model_preparation_time": 0.0, "eval_runtime": 2150.6383, "eval_samples_per_second": 3.525, "eval_steps_per_second": 1.763, "step": 10500 }, { "epoch": 1.2322663852737719, "grad_norm": 1.2761465311050415, "learning_rate": 6.419008084706676e-06, "loss": 0.5239, "step": 10510 }, { "epoch": 1.2334388556688944, "grad_norm": 1.2324775457382202, "learning_rate": 6.413097289831158e-06, "loss": 0.4863, "step": 10520 }, { "epoch": 1.2346113260640168, "grad_norm": 1.3752108812332153, "learning_rate": 6.4071843479444116e-06, "loss": 0.5554, "step": 10530 }, { "epoch": 1.2357837964591394, "grad_norm": 1.4458686113357544, "learning_rate": 6.401269268030356e-06, "loss": 0.5239, "step": 10540 }, { "epoch": 1.236956266854262, "grad_norm": 1.5159621238708496, "learning_rate": 6.395352059076156e-06, "loss": 0.524, "step": 10550 }, { "epoch": 1.2381287372493845, "grad_norm": 1.019576072692871, "learning_rate": 6.389432730072219e-06, "loss": 0.4952, "step": 10560 }, { "epoch": 1.239301207644507, "grad_norm": 1.5250908136367798, "learning_rate": 6.383511290012168e-06, "loss": 0.5409, "step": 10570 }, { "epoch": 1.2404736780396295, "grad_norm": 1.4203544855117798, "learning_rate": 6.377587747892833e-06, "loss": 0.461, "step": 10580 }, { "epoch": 1.241646148434752, "grad_norm": 1.4657704830169678, "learning_rate": 6.3716621127142395e-06, "loss": 0.5505, "step": 10590 }, { "epoch": 1.2428186188298747, "grad_norm": 1.3744847774505615, "learning_rate": 6.365734393479595e-06, "loss": 0.5031, "step": 10600 }, { "epoch": 1.243991089224997, "grad_norm": 1.0535110235214233, "learning_rate": 6.35980459919527e-06, "loss": 0.5443, "step": 10610 }, { "epoch": 1.2451635596201196, "grad_norm": 1.4593373537063599, "learning_rate": 6.353872738870787e-06, "loss": 0.4944, "step": 10620 }, { "epoch": 1.2463360300152422, "grad_norm": 1.2568562030792236, "learning_rate": 6.3479388215188135e-06, "loss": 0.5556, "step": 10630 }, { "epoch": 1.2475085004103645, "grad_norm": 1.2667771577835083, "learning_rate": 6.34200285615514e-06, "loss": 0.51, "step": 10640 }, { "epoch": 1.2486809708054871, "grad_norm": 1.301358938217163, "learning_rate": 6.3360648517986605e-06, "loss": 0.5364, "step": 10650 }, { "epoch": 1.2498534412006097, "grad_norm": 1.097075343132019, "learning_rate": 6.330124817471379e-06, "loss": 0.5683, "step": 10660 }, { "epoch": 1.2510259115957323, "grad_norm": 1.3043802976608276, "learning_rate": 6.324182762198379e-06, "loss": 0.4983, "step": 10670 }, { "epoch": 1.2521983819908549, "grad_norm": 1.3721482753753662, "learning_rate": 6.31823869500781e-06, "loss": 0.5382, "step": 10680 }, { "epoch": 1.2533708523859772, "grad_norm": 1.3450685739517212, "learning_rate": 6.312292624930886e-06, "loss": 0.5462, "step": 10690 }, { "epoch": 1.2545433227810998, "grad_norm": 1.3407005071640015, "learning_rate": 6.30634456100186e-06, "loss": 0.5339, "step": 10700 }, { "epoch": 1.2557157931762224, "grad_norm": 1.3235433101654053, "learning_rate": 6.300394512258015e-06, "loss": 0.4733, "step": 10710 }, { "epoch": 1.2568882635713448, "grad_norm": 1.323158621788025, "learning_rate": 6.294442487739647e-06, "loss": 0.5171, "step": 10720 }, { "epoch": 1.2580607339664673, "grad_norm": 1.5862518548965454, "learning_rate": 6.288488496490061e-06, "loss": 0.543, "step": 10730 }, { "epoch": 1.25923320436159, "grad_norm": 1.2289546728134155, "learning_rate": 6.282532547555542e-06, "loss": 0.4701, "step": 10740 }, { "epoch": 1.2604056747567123, "grad_norm": 1.3704835176467896, "learning_rate": 6.276574649985354e-06, "loss": 0.5227, "step": 10750 }, { "epoch": 1.2615781451518349, "grad_norm": 1.1688895225524902, "learning_rate": 6.2706148128317225e-06, "loss": 0.5608, "step": 10760 }, { "epoch": 1.2627506155469574, "grad_norm": 1.200037956237793, "learning_rate": 6.264653045149817e-06, "loss": 0.5211, "step": 10770 }, { "epoch": 1.26392308594208, "grad_norm": 1.229371190071106, "learning_rate": 6.25868935599774e-06, "loss": 0.4846, "step": 10780 }, { "epoch": 1.2650955563372026, "grad_norm": 1.4086061716079712, "learning_rate": 6.252723754436518e-06, "loss": 0.5342, "step": 10790 }, { "epoch": 1.266268026732325, "grad_norm": 1.4190107583999634, "learning_rate": 6.246756249530079e-06, "loss": 0.5578, "step": 10800 }, { "epoch": 1.2674404971274476, "grad_norm": 1.371416449546814, "learning_rate": 6.240786850345243e-06, "loss": 0.5224, "step": 10810 }, { "epoch": 1.2686129675225701, "grad_norm": 1.158677101135254, "learning_rate": 6.234815565951708e-06, "loss": 0.4864, "step": 10820 }, { "epoch": 1.2697854379176925, "grad_norm": 1.0740365982055664, "learning_rate": 6.228842405422041e-06, "loss": 0.4949, "step": 10830 }, { "epoch": 1.270957908312815, "grad_norm": 1.1311559677124023, "learning_rate": 6.2228673778316516e-06, "loss": 0.5589, "step": 10840 }, { "epoch": 1.2721303787079377, "grad_norm": 1.4327701330184937, "learning_rate": 6.2168904922587935e-06, "loss": 0.541, "step": 10850 }, { "epoch": 1.27330284910306, "grad_norm": 1.257973551750183, "learning_rate": 6.210911757784538e-06, "loss": 0.5096, "step": 10860 }, { "epoch": 1.2744753194981826, "grad_norm": 1.2791805267333984, "learning_rate": 6.2049311834927705e-06, "loss": 0.4666, "step": 10870 }, { "epoch": 1.2756477898933052, "grad_norm": 1.4408029317855835, "learning_rate": 6.198948778470167e-06, "loss": 0.5642, "step": 10880 }, { "epoch": 1.2768202602884278, "grad_norm": 1.378873348236084, "learning_rate": 6.192964551806186e-06, "loss": 0.506, "step": 10890 }, { "epoch": 1.2779927306835503, "grad_norm": 1.4418673515319824, "learning_rate": 6.1869785125930605e-06, "loss": 0.5484, "step": 10900 }, { "epoch": 1.2791652010786727, "grad_norm": 1.424781084060669, "learning_rate": 6.180990669925765e-06, "loss": 0.5345, "step": 10910 }, { "epoch": 1.2803376714737953, "grad_norm": 1.1830706596374512, "learning_rate": 6.175001032902024e-06, "loss": 0.4832, "step": 10920 }, { "epoch": 1.2815101418689179, "grad_norm": 1.4402403831481934, "learning_rate": 6.169009610622285e-06, "loss": 0.5692, "step": 10930 }, { "epoch": 1.2826826122640402, "grad_norm": 1.2258265018463135, "learning_rate": 6.163016412189708e-06, "loss": 0.5681, "step": 10940 }, { "epoch": 1.2838550826591628, "grad_norm": 1.3396728038787842, "learning_rate": 6.157021446710152e-06, "loss": 0.5689, "step": 10950 }, { "epoch": 1.2850275530542854, "grad_norm": 1.2663851976394653, "learning_rate": 6.15102472329216e-06, "loss": 0.4737, "step": 10960 }, { "epoch": 1.286200023449408, "grad_norm": 1.2220714092254639, "learning_rate": 6.1450262510469486e-06, "loss": 0.4862, "step": 10970 }, { "epoch": 1.2873724938445306, "grad_norm": 1.283007264137268, "learning_rate": 6.139026039088385e-06, "loss": 0.5446, "step": 10980 }, { "epoch": 1.288544964239653, "grad_norm": 1.3460263013839722, "learning_rate": 6.133024096532989e-06, "loss": 0.5372, "step": 10990 }, { "epoch": 1.2897174346347755, "grad_norm": 1.2551414966583252, "learning_rate": 6.1270204324999014e-06, "loss": 0.5026, "step": 11000 }, { "epoch": 1.2897174346347755, "eval_loss": 0.7176235318183899, "eval_model_preparation_time": 0.0, "eval_runtime": 2144.5661, "eval_samples_per_second": 3.535, "eval_steps_per_second": 1.768, "step": 11000 }, { "epoch": 1.290889905029898, "grad_norm": 1.2289280891418457, "learning_rate": 6.121015056110884e-06, "loss": 0.4946, "step": 11010 }, { "epoch": 1.2920623754250204, "grad_norm": 1.3631515502929688, "learning_rate": 6.115007976490297e-06, "loss": 0.5368, "step": 11020 }, { "epoch": 1.293234845820143, "grad_norm": 1.3701854944229126, "learning_rate": 6.1089992027650916e-06, "loss": 0.4852, "step": 11030 }, { "epoch": 1.2944073162152656, "grad_norm": 1.3369117975234985, "learning_rate": 6.1029887440647875e-06, "loss": 0.503, "step": 11040 }, { "epoch": 1.295579786610388, "grad_norm": 1.1635358333587646, "learning_rate": 6.096976609521472e-06, "loss": 0.471, "step": 11050 }, { "epoch": 1.2967522570055106, "grad_norm": 1.338908076286316, "learning_rate": 6.090962808269774e-06, "loss": 0.4548, "step": 11060 }, { "epoch": 1.2979247274006331, "grad_norm": 1.3267487287521362, "learning_rate": 6.084947349446853e-06, "loss": 0.5578, "step": 11070 }, { "epoch": 1.2990971977957557, "grad_norm": 1.3526356220245361, "learning_rate": 6.078930242192391e-06, "loss": 0.5333, "step": 11080 }, { "epoch": 1.3002696681908783, "grad_norm": 1.1862406730651855, "learning_rate": 6.072911495648574e-06, "loss": 0.4561, "step": 11090 }, { "epoch": 1.3014421385860007, "grad_norm": 1.398084044456482, "learning_rate": 6.066891118960077e-06, "loss": 0.5077, "step": 11100 }, { "epoch": 1.3026146089811232, "grad_norm": 0.9979396462440491, "learning_rate": 6.060869121274052e-06, "loss": 0.4612, "step": 11110 }, { "epoch": 1.3037870793762458, "grad_norm": 1.2822518348693848, "learning_rate": 6.054845511740115e-06, "loss": 0.51, "step": 11120 }, { "epoch": 1.3049595497713682, "grad_norm": 1.34726881980896, "learning_rate": 6.048820299510329e-06, "loss": 0.492, "step": 11130 }, { "epoch": 1.3061320201664908, "grad_norm": 1.1825965642929077, "learning_rate": 6.042793493739196e-06, "loss": 0.5428, "step": 11140 }, { "epoch": 1.3073044905616134, "grad_norm": 1.097976565361023, "learning_rate": 6.0367651035836325e-06, "loss": 0.5241, "step": 11150 }, { "epoch": 1.3084769609567357, "grad_norm": 1.2326061725616455, "learning_rate": 6.030735138202971e-06, "loss": 0.5531, "step": 11160 }, { "epoch": 1.3096494313518583, "grad_norm": 1.2282270193099976, "learning_rate": 6.02470360675893e-06, "loss": 0.4658, "step": 11170 }, { "epoch": 1.3108219017469809, "grad_norm": 1.3594703674316406, "learning_rate": 6.01867051841561e-06, "loss": 0.4724, "step": 11180 }, { "epoch": 1.3119943721421035, "grad_norm": 1.5228323936462402, "learning_rate": 6.012635882339479e-06, "loss": 0.4954, "step": 11190 }, { "epoch": 1.313166842537226, "grad_norm": 1.3680020570755005, "learning_rate": 6.006599707699353e-06, "loss": 0.496, "step": 11200 }, { "epoch": 1.3143393129323484, "grad_norm": 1.3853033781051636, "learning_rate": 6.0005620036663855e-06, "loss": 0.4886, "step": 11210 }, { "epoch": 1.315511783327471, "grad_norm": 1.1824673414230347, "learning_rate": 5.994522779414061e-06, "loss": 0.5208, "step": 11220 }, { "epoch": 1.3166842537225936, "grad_norm": 1.231902837753296, "learning_rate": 5.9884820441181645e-06, "loss": 0.5201, "step": 11230 }, { "epoch": 1.317856724117716, "grad_norm": 1.4222874641418457, "learning_rate": 5.982439806956779e-06, "loss": 0.5588, "step": 11240 }, { "epoch": 1.3190291945128385, "grad_norm": 1.2598108053207397, "learning_rate": 5.976396077110276e-06, "loss": 0.5539, "step": 11250 }, { "epoch": 1.320201664907961, "grad_norm": 1.3098900318145752, "learning_rate": 5.970350863761285e-06, "loss": 0.464, "step": 11260 }, { "epoch": 1.3213741353030837, "grad_norm": 1.618918538093567, "learning_rate": 5.964304176094697e-06, "loss": 0.5137, "step": 11270 }, { "epoch": 1.3225466056982063, "grad_norm": 1.3800883293151855, "learning_rate": 5.958256023297642e-06, "loss": 0.5123, "step": 11280 }, { "epoch": 1.3237190760933286, "grad_norm": 1.1502987146377563, "learning_rate": 5.9522064145594705e-06, "loss": 0.4873, "step": 11290 }, { "epoch": 1.3248915464884512, "grad_norm": 1.3977410793304443, "learning_rate": 5.946155359071752e-06, "loss": 0.4988, "step": 11300 }, { "epoch": 1.3260640168835738, "grad_norm": 1.2127212285995483, "learning_rate": 5.94010286602825e-06, "loss": 0.5119, "step": 11310 }, { "epoch": 1.3272364872786961, "grad_norm": 1.3775907754898071, "learning_rate": 5.934048944624913e-06, "loss": 0.5319, "step": 11320 }, { "epoch": 1.3284089576738187, "grad_norm": 1.624830961227417, "learning_rate": 5.927993604059863e-06, "loss": 0.4799, "step": 11330 }, { "epoch": 1.3295814280689413, "grad_norm": 1.3471887111663818, "learning_rate": 5.921936853533373e-06, "loss": 0.4782, "step": 11340 }, { "epoch": 1.3307538984640637, "grad_norm": 1.308506727218628, "learning_rate": 5.91587870224786e-06, "loss": 0.5017, "step": 11350 }, { "epoch": 1.3319263688591862, "grad_norm": 1.4306446313858032, "learning_rate": 5.909819159407872e-06, "loss": 0.5104, "step": 11360 }, { "epoch": 1.3330988392543088, "grad_norm": 1.2396124601364136, "learning_rate": 5.9037582342200685e-06, "loss": 0.512, "step": 11370 }, { "epoch": 1.3342713096494314, "grad_norm": 1.3393092155456543, "learning_rate": 5.897695935893209e-06, "loss": 0.4845, "step": 11380 }, { "epoch": 1.335443780044554, "grad_norm": 1.4905699491500854, "learning_rate": 5.89163227363814e-06, "loss": 0.5063, "step": 11390 }, { "epoch": 1.3366162504396764, "grad_norm": 1.247475504875183, "learning_rate": 5.885567256667784e-06, "loss": 0.5124, "step": 11400 }, { "epoch": 1.337788720834799, "grad_norm": 1.3271054029464722, "learning_rate": 5.8795008941971125e-06, "loss": 0.4786, "step": 11410 }, { "epoch": 1.3389611912299215, "grad_norm": 1.4513108730316162, "learning_rate": 5.873433195443152e-06, "loss": 0.4994, "step": 11420 }, { "epoch": 1.3401336616250439, "grad_norm": 1.2124738693237305, "learning_rate": 5.867364169624953e-06, "loss": 0.504, "step": 11430 }, { "epoch": 1.3413061320201665, "grad_norm": 1.2546453475952148, "learning_rate": 5.861293825963583e-06, "loss": 0.4907, "step": 11440 }, { "epoch": 1.342478602415289, "grad_norm": 1.2588515281677246, "learning_rate": 5.855222173682113e-06, "loss": 0.528, "step": 11450 }, { "epoch": 1.3436510728104114, "grad_norm": 1.5712610483169556, "learning_rate": 5.849149222005604e-06, "loss": 0.5227, "step": 11460 }, { "epoch": 1.344823543205534, "grad_norm": 1.5296075344085693, "learning_rate": 5.843074980161086e-06, "loss": 0.4984, "step": 11470 }, { "epoch": 1.3459960136006566, "grad_norm": 1.2838728427886963, "learning_rate": 5.836999457377552e-06, "loss": 0.5325, "step": 11480 }, { "epoch": 1.3471684839957792, "grad_norm": 1.2027573585510254, "learning_rate": 5.830922662885945e-06, "loss": 0.5551, "step": 11490 }, { "epoch": 1.3483409543909017, "grad_norm": 1.3629884719848633, "learning_rate": 5.824844605919133e-06, "loss": 0.5055, "step": 11500 }, { "epoch": 1.3483409543909017, "eval_loss": 0.7162554264068604, "eval_model_preparation_time": 0.0, "eval_runtime": 2147.2565, "eval_samples_per_second": 3.531, "eval_steps_per_second": 1.766, "step": 11500 }, { "epoch": 1.349513424786024, "grad_norm": 1.1578705310821533, "learning_rate": 5.8187652957119076e-06, "loss": 0.5008, "step": 11510 }, { "epoch": 1.3506858951811467, "grad_norm": 1.4688276052474976, "learning_rate": 5.812684741500966e-06, "loss": 0.4835, "step": 11520 }, { "epoch": 1.3518583655762693, "grad_norm": 1.3659480810165405, "learning_rate": 5.806602952524887e-06, "loss": 0.5156, "step": 11530 }, { "epoch": 1.3530308359713916, "grad_norm": 1.3869564533233643, "learning_rate": 5.800519938024133e-06, "loss": 0.5184, "step": 11540 }, { "epoch": 1.3542033063665142, "grad_norm": 1.2423886060714722, "learning_rate": 5.794435707241025e-06, "loss": 0.4681, "step": 11550 }, { "epoch": 1.3553757767616368, "grad_norm": 1.2195144891738892, "learning_rate": 5.788350269419736e-06, "loss": 0.4894, "step": 11560 }, { "epoch": 1.3565482471567594, "grad_norm": 1.461804986000061, "learning_rate": 5.782263633806268e-06, "loss": 0.5399, "step": 11570 }, { "epoch": 1.357720717551882, "grad_norm": 1.040953516960144, "learning_rate": 5.776175809648444e-06, "loss": 0.4866, "step": 11580 }, { "epoch": 1.3588931879470043, "grad_norm": 1.2469749450683594, "learning_rate": 5.770086806195897e-06, "loss": 0.5559, "step": 11590 }, { "epoch": 1.360065658342127, "grad_norm": 1.2807772159576416, "learning_rate": 5.763996632700046e-06, "loss": 0.5052, "step": 11600 }, { "epoch": 1.3612381287372495, "grad_norm": 1.3151432275772095, "learning_rate": 5.7579052984140914e-06, "loss": 0.5342, "step": 11610 }, { "epoch": 1.3624105991323718, "grad_norm": 1.7146575450897217, "learning_rate": 5.751812812592996e-06, "loss": 0.5129, "step": 11620 }, { "epoch": 1.3635830695274944, "grad_norm": 1.2460517883300781, "learning_rate": 5.745719184493472e-06, "loss": 0.5352, "step": 11630 }, { "epoch": 1.364755539922617, "grad_norm": 1.5328826904296875, "learning_rate": 5.739624423373967e-06, "loss": 0.5028, "step": 11640 }, { "epoch": 1.3659280103177394, "grad_norm": 1.4083796739578247, "learning_rate": 5.733528538494654e-06, "loss": 0.5242, "step": 11650 }, { "epoch": 1.367100480712862, "grad_norm": 1.726281762123108, "learning_rate": 5.727431539117405e-06, "loss": 0.4686, "step": 11660 }, { "epoch": 1.3682729511079845, "grad_norm": 1.286133885383606, "learning_rate": 5.7213334345057936e-06, "loss": 0.4934, "step": 11670 }, { "epoch": 1.369445421503107, "grad_norm": 1.1853915452957153, "learning_rate": 5.715234233925069e-06, "loss": 0.5124, "step": 11680 }, { "epoch": 1.3706178918982297, "grad_norm": 1.30171537399292, "learning_rate": 5.709133946642144e-06, "loss": 0.4852, "step": 11690 }, { "epoch": 1.371790362293352, "grad_norm": 1.1958101987838745, "learning_rate": 5.703032581925587e-06, "loss": 0.4944, "step": 11700 }, { "epoch": 1.3729628326884746, "grad_norm": 1.2722560167312622, "learning_rate": 5.6969301490456e-06, "loss": 0.4917, "step": 11710 }, { "epoch": 1.3741353030835972, "grad_norm": 1.2838225364685059, "learning_rate": 5.690826657274009e-06, "loss": 0.5668, "step": 11720 }, { "epoch": 1.3753077734787196, "grad_norm": 1.1619336605072021, "learning_rate": 5.684722115884247e-06, "loss": 0.4849, "step": 11730 }, { "epoch": 1.3764802438738422, "grad_norm": 1.3464888334274292, "learning_rate": 5.6786165341513455e-06, "loss": 0.5011, "step": 11740 }, { "epoch": 1.3776527142689647, "grad_norm": 1.221006989479065, "learning_rate": 5.672509921351915e-06, "loss": 0.5247, "step": 11750 }, { "epoch": 1.378825184664087, "grad_norm": 1.29966139793396, "learning_rate": 5.6664022867641284e-06, "loss": 0.4789, "step": 11760 }, { "epoch": 1.3799976550592097, "grad_norm": 1.3600149154663086, "learning_rate": 5.660293639667719e-06, "loss": 0.5433, "step": 11770 }, { "epoch": 1.3811701254543323, "grad_norm": 1.2525006532669067, "learning_rate": 5.654183989343952e-06, "loss": 0.5461, "step": 11780 }, { "epoch": 1.3823425958494548, "grad_norm": 1.1985055208206177, "learning_rate": 5.648073345075621e-06, "loss": 0.5154, "step": 11790 }, { "epoch": 1.3835150662445774, "grad_norm": 1.1581168174743652, "learning_rate": 5.641961716147025e-06, "loss": 0.5004, "step": 11800 }, { "epoch": 1.3846875366396998, "grad_norm": 1.4538780450820923, "learning_rate": 5.635849111843965e-06, "loss": 0.5043, "step": 11810 }, { "epoch": 1.3858600070348224, "grad_norm": 1.2476485967636108, "learning_rate": 5.629735541453719e-06, "loss": 0.4978, "step": 11820 }, { "epoch": 1.387032477429945, "grad_norm": 1.3988218307495117, "learning_rate": 5.623621014265034e-06, "loss": 0.5002, "step": 11830 }, { "epoch": 1.3882049478250673, "grad_norm": 1.2321653366088867, "learning_rate": 5.617505539568114e-06, "loss": 0.5354, "step": 11840 }, { "epoch": 1.38937741822019, "grad_norm": 1.3794342279434204, "learning_rate": 5.6113891266546e-06, "loss": 0.4922, "step": 11850 }, { "epoch": 1.3905498886153125, "grad_norm": 1.1712243556976318, "learning_rate": 5.605271784817557e-06, "loss": 0.5017, "step": 11860 }, { "epoch": 1.391722359010435, "grad_norm": 1.3573167324066162, "learning_rate": 5.599153523351462e-06, "loss": 0.491, "step": 11870 }, { "epoch": 1.3928948294055576, "grad_norm": 1.1764177083969116, "learning_rate": 5.593034351552192e-06, "loss": 0.5332, "step": 11880 }, { "epoch": 1.39406729980068, "grad_norm": 1.4443508386611938, "learning_rate": 5.586914278717006e-06, "loss": 0.5323, "step": 11890 }, { "epoch": 1.3952397701958026, "grad_norm": 1.3941154479980469, "learning_rate": 5.580793314144529e-06, "loss": 0.4892, "step": 11900 }, { "epoch": 1.3964122405909252, "grad_norm": 1.4483630657196045, "learning_rate": 5.574671467134746e-06, "loss": 0.5453, "step": 11910 }, { "epoch": 1.3975847109860475, "grad_norm": 1.6313278675079346, "learning_rate": 5.568548746988978e-06, "loss": 0.4765, "step": 11920 }, { "epoch": 1.3987571813811701, "grad_norm": 1.2649754285812378, "learning_rate": 5.562425163009873e-06, "loss": 0.4799, "step": 11930 }, { "epoch": 1.3999296517762927, "grad_norm": 1.7048743963241577, "learning_rate": 5.556300724501397e-06, "loss": 0.5164, "step": 11940 }, { "epoch": 1.401102122171415, "grad_norm": 1.4407027959823608, "learning_rate": 5.55017544076881e-06, "loss": 0.4733, "step": 11950 }, { "epoch": 1.4022745925665376, "grad_norm": 1.3245989084243774, "learning_rate": 5.544049321118653e-06, "loss": 0.5091, "step": 11960 }, { "epoch": 1.4034470629616602, "grad_norm": 1.2318891286849976, "learning_rate": 5.537922374858743e-06, "loss": 0.5278, "step": 11970 }, { "epoch": 1.4046195333567828, "grad_norm": 1.2656196355819702, "learning_rate": 5.531794611298152e-06, "loss": 0.5061, "step": 11980 }, { "epoch": 1.4057920037519054, "grad_norm": 1.5293779373168945, "learning_rate": 5.525666039747189e-06, "loss": 0.5371, "step": 11990 }, { "epoch": 1.4069644741470277, "grad_norm": 1.4670557975769043, "learning_rate": 5.519536669517396e-06, "loss": 0.5258, "step": 12000 }, { "epoch": 1.4069644741470277, "eval_loss": 0.7116261720657349, "eval_model_preparation_time": 0.0, "eval_runtime": 2143.844, "eval_samples_per_second": 3.537, "eval_steps_per_second": 1.768, "step": 12000 }, { "epoch": 1.4081369445421503, "grad_norm": 1.4891761541366577, "learning_rate": 5.513406509921527e-06, "loss": 0.5148, "step": 12010 }, { "epoch": 1.409309414937273, "grad_norm": 1.3348000049591064, "learning_rate": 5.507275570273535e-06, "loss": 0.5081, "step": 12020 }, { "epoch": 1.4104818853323953, "grad_norm": 1.5534698963165283, "learning_rate": 5.501143859888556e-06, "loss": 0.5114, "step": 12030 }, { "epoch": 1.4116543557275179, "grad_norm": 1.4283145666122437, "learning_rate": 5.4950113880829e-06, "loss": 0.51, "step": 12040 }, { "epoch": 1.4128268261226404, "grad_norm": 1.4259769916534424, "learning_rate": 5.488878164174034e-06, "loss": 0.496, "step": 12050 }, { "epoch": 1.4139992965177628, "grad_norm": 1.3630540370941162, "learning_rate": 5.482744197480564e-06, "loss": 0.4809, "step": 12060 }, { "epoch": 1.4151717669128854, "grad_norm": 1.5067250728607178, "learning_rate": 5.47660949732223e-06, "loss": 0.5608, "step": 12070 }, { "epoch": 1.416344237308008, "grad_norm": 1.2912323474884033, "learning_rate": 5.470474073019884e-06, "loss": 0.454, "step": 12080 }, { "epoch": 1.4175167077031305, "grad_norm": 1.3794432878494263, "learning_rate": 5.464337933895473e-06, "loss": 0.5338, "step": 12090 }, { "epoch": 1.4186891780982531, "grad_norm": 1.1614420413970947, "learning_rate": 5.4582010892720404e-06, "loss": 0.5776, "step": 12100 }, { "epoch": 1.4198616484933755, "grad_norm": 1.3003995418548584, "learning_rate": 5.452063548473694e-06, "loss": 0.5, "step": 12110 }, { "epoch": 1.421034118888498, "grad_norm": 1.1236634254455566, "learning_rate": 5.4459253208256e-06, "loss": 0.5303, "step": 12120 }, { "epoch": 1.4222065892836206, "grad_norm": 1.3469139337539673, "learning_rate": 5.439786415653972e-06, "loss": 0.4728, "step": 12130 }, { "epoch": 1.423379059678743, "grad_norm": 1.3524476289749146, "learning_rate": 5.433646842286049e-06, "loss": 0.5481, "step": 12140 }, { "epoch": 1.4245515300738656, "grad_norm": 1.3269134759902954, "learning_rate": 5.427506610050086e-06, "loss": 0.5298, "step": 12150 }, { "epoch": 1.4257240004689882, "grad_norm": 1.2551853656768799, "learning_rate": 5.421365728275343e-06, "loss": 0.5457, "step": 12160 }, { "epoch": 1.4268964708641108, "grad_norm": 1.457955002784729, "learning_rate": 5.415224206292059e-06, "loss": 0.5067, "step": 12170 }, { "epoch": 1.4280689412592333, "grad_norm": 1.258002758026123, "learning_rate": 5.409082053431457e-06, "loss": 0.5366, "step": 12180 }, { "epoch": 1.4292414116543557, "grad_norm": 1.133952021598816, "learning_rate": 5.402939279025705e-06, "loss": 0.4831, "step": 12190 }, { "epoch": 1.4304138820494783, "grad_norm": 1.2200407981872559, "learning_rate": 5.396795892407926e-06, "loss": 0.4657, "step": 12200 }, { "epoch": 1.4315863524446009, "grad_norm": 1.4325788021087646, "learning_rate": 5.39065190291217e-06, "loss": 0.5727, "step": 12210 }, { "epoch": 1.4327588228397232, "grad_norm": 2.00761079788208, "learning_rate": 5.3845073198734e-06, "loss": 0.54, "step": 12220 }, { "epoch": 1.4339312932348458, "grad_norm": 1.4783564805984497, "learning_rate": 5.3783621526274845e-06, "loss": 0.4936, "step": 12230 }, { "epoch": 1.4351037636299684, "grad_norm": 1.4188710451126099, "learning_rate": 5.372216410511179e-06, "loss": 0.4456, "step": 12240 }, { "epoch": 1.4362762340250907, "grad_norm": 1.3198959827423096, "learning_rate": 5.366070102862113e-06, "loss": 0.4917, "step": 12250 }, { "epoch": 1.4374487044202133, "grad_norm": 1.2150040864944458, "learning_rate": 5.359923239018771e-06, "loss": 0.4879, "step": 12260 }, { "epoch": 1.438621174815336, "grad_norm": 1.2523537874221802, "learning_rate": 5.353775828320489e-06, "loss": 0.5009, "step": 12270 }, { "epoch": 1.4397936452104585, "grad_norm": 1.7466691732406616, "learning_rate": 5.347627880107429e-06, "loss": 0.4825, "step": 12280 }, { "epoch": 1.440966115605581, "grad_norm": 1.3185189962387085, "learning_rate": 5.341479403720569e-06, "loss": 0.5019, "step": 12290 }, { "epoch": 1.4421385860007034, "grad_norm": 1.4928735494613647, "learning_rate": 5.335330408501696e-06, "loss": 0.4813, "step": 12300 }, { "epoch": 1.443311056395826, "grad_norm": 1.2649588584899902, "learning_rate": 5.329180903793377e-06, "loss": 0.5403, "step": 12310 }, { "epoch": 1.4444835267909486, "grad_norm": 1.3241353034973145, "learning_rate": 5.323030898938959e-06, "loss": 0.532, "step": 12320 }, { "epoch": 1.445655997186071, "grad_norm": 1.2303539514541626, "learning_rate": 5.316880403282548e-06, "loss": 0.4453, "step": 12330 }, { "epoch": 1.4468284675811935, "grad_norm": 1.1965874433517456, "learning_rate": 5.3107294261689946e-06, "loss": 0.5081, "step": 12340 }, { "epoch": 1.4480009379763161, "grad_norm": 1.2996339797973633, "learning_rate": 5.304577976943877e-06, "loss": 0.4725, "step": 12350 }, { "epoch": 1.4491734083714385, "grad_norm": 1.409911870956421, "learning_rate": 5.298426064953501e-06, "loss": 0.5786, "step": 12360 }, { "epoch": 1.450345878766561, "grad_norm": 1.3770809173583984, "learning_rate": 5.292273699544866e-06, "loss": 0.4762, "step": 12370 }, { "epoch": 1.4515183491616837, "grad_norm": 1.4983558654785156, "learning_rate": 5.286120890065662e-06, "loss": 0.5235, "step": 12380 }, { "epoch": 1.4526908195568062, "grad_norm": 1.3937098979949951, "learning_rate": 5.279967645864259e-06, "loss": 0.5212, "step": 12390 }, { "epoch": 1.4538632899519288, "grad_norm": 1.2485182285308838, "learning_rate": 5.27381397628968e-06, "loss": 0.4972, "step": 12400 }, { "epoch": 1.4550357603470512, "grad_norm": 1.5129380226135254, "learning_rate": 5.267659890691601e-06, "loss": 0.5305, "step": 12410 }, { "epoch": 1.4562082307421738, "grad_norm": 1.2059566974639893, "learning_rate": 5.2615053984203255e-06, "loss": 0.5138, "step": 12420 }, { "epoch": 1.4573807011372963, "grad_norm": 1.288607120513916, "learning_rate": 5.255350508826777e-06, "loss": 0.5164, "step": 12430 }, { "epoch": 1.4585531715324187, "grad_norm": 1.6773043870925903, "learning_rate": 5.249195231262483e-06, "loss": 0.4981, "step": 12440 }, { "epoch": 1.4597256419275413, "grad_norm": 1.4500031471252441, "learning_rate": 5.243039575079559e-06, "loss": 0.5101, "step": 12450 }, { "epoch": 1.4608981123226639, "grad_norm": 1.6095887422561646, "learning_rate": 5.236883549630696e-06, "loss": 0.4782, "step": 12460 }, { "epoch": 1.4620705827177864, "grad_norm": 1.5660251379013062, "learning_rate": 5.230727164269147e-06, "loss": 0.5813, "step": 12470 }, { "epoch": 1.463243053112909, "grad_norm": 1.2926603555679321, "learning_rate": 5.224570428348712e-06, "loss": 0.5325, "step": 12480 }, { "epoch": 1.4644155235080314, "grad_norm": 1.4387691020965576, "learning_rate": 5.2184133512237214e-06, "loss": 0.5104, "step": 12490 }, { "epoch": 1.465587993903154, "grad_norm": 1.3331825733184814, "learning_rate": 5.212255942249027e-06, "loss": 0.4936, "step": 12500 }, { "epoch": 1.465587993903154, "eval_loss": 0.7078402638435364, "eval_model_preparation_time": 0.0, "eval_runtime": 2146.8315, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.766, "step": 12500 }, { "epoch": 1.4667604642982766, "grad_norm": 1.2927879095077515, "learning_rate": 5.206098210779983e-06, "loss": 0.499, "step": 12510 }, { "epoch": 1.467932934693399, "grad_norm": 1.2485418319702148, "learning_rate": 5.199940166172432e-06, "loss": 0.4743, "step": 12520 }, { "epoch": 1.4691054050885215, "grad_norm": 1.2938628196716309, "learning_rate": 5.1937818177827e-06, "loss": 0.4869, "step": 12530 }, { "epoch": 1.470277875483644, "grad_norm": 1.2267080545425415, "learning_rate": 5.187623174967562e-06, "loss": 0.525, "step": 12540 }, { "epoch": 1.4714503458787664, "grad_norm": 1.8241894245147705, "learning_rate": 5.181464247084252e-06, "loss": 0.5346, "step": 12550 }, { "epoch": 1.472622816273889, "grad_norm": 1.2930456399917603, "learning_rate": 5.1753050434904295e-06, "loss": 0.5177, "step": 12560 }, { "epoch": 1.4737952866690116, "grad_norm": 1.8011369705200195, "learning_rate": 5.169145573544177e-06, "loss": 0.4957, "step": 12570 }, { "epoch": 1.4749677570641342, "grad_norm": 1.1115459203720093, "learning_rate": 5.16298584660398e-06, "loss": 0.453, "step": 12580 }, { "epoch": 1.4761402274592568, "grad_norm": 1.435603141784668, "learning_rate": 5.156825872028718e-06, "loss": 0.5221, "step": 12590 }, { "epoch": 1.4773126978543791, "grad_norm": 1.3356236219406128, "learning_rate": 5.1506656591776395e-06, "loss": 0.5034, "step": 12600 }, { "epoch": 1.4784851682495017, "grad_norm": 1.4777098894119263, "learning_rate": 5.14450521741036e-06, "loss": 0.476, "step": 12610 }, { "epoch": 1.4796576386446243, "grad_norm": 1.3276792764663696, "learning_rate": 5.138344556086843e-06, "loss": 0.477, "step": 12620 }, { "epoch": 1.4808301090397467, "grad_norm": 1.5110769271850586, "learning_rate": 5.132183684567384e-06, "loss": 0.4625, "step": 12630 }, { "epoch": 1.4820025794348692, "grad_norm": 1.3571935892105103, "learning_rate": 5.1260226122126e-06, "loss": 0.5227, "step": 12640 }, { "epoch": 1.4831750498299918, "grad_norm": 1.0874842405319214, "learning_rate": 5.11986134838341e-06, "loss": 0.4905, "step": 12650 }, { "epoch": 1.4843475202251142, "grad_norm": 1.388489842414856, "learning_rate": 5.113699902441025e-06, "loss": 0.5132, "step": 12660 }, { "epoch": 1.4855199906202368, "grad_norm": 1.7157195806503296, "learning_rate": 5.1075382837469355e-06, "loss": 0.5577, "step": 12670 }, { "epoch": 1.4866924610153593, "grad_norm": 1.264939785003662, "learning_rate": 5.10137650166289e-06, "loss": 0.5341, "step": 12680 }, { "epoch": 1.487864931410482, "grad_norm": 1.9557205438613892, "learning_rate": 5.0952145655508875e-06, "loss": 0.4751, "step": 12690 }, { "epoch": 1.4890374018056045, "grad_norm": 1.356464147567749, "learning_rate": 5.089052484773163e-06, "loss": 0.5003, "step": 12700 }, { "epoch": 1.4902098722007269, "grad_norm": 1.256229043006897, "learning_rate": 5.082890268692169e-06, "loss": 0.5322, "step": 12710 }, { "epoch": 1.4913823425958495, "grad_norm": 1.2527382373809814, "learning_rate": 5.07672792667056e-06, "loss": 0.5327, "step": 12720 }, { "epoch": 1.492554812990972, "grad_norm": 1.2529520988464355, "learning_rate": 5.070565468071191e-06, "loss": 0.5411, "step": 12730 }, { "epoch": 1.4937272833860944, "grad_norm": 1.5388060808181763, "learning_rate": 5.064402902257086e-06, "loss": 0.4889, "step": 12740 }, { "epoch": 1.494899753781217, "grad_norm": 1.4744281768798828, "learning_rate": 5.058240238591435e-06, "loss": 0.5327, "step": 12750 }, { "epoch": 1.4960722241763396, "grad_norm": 1.2421237230300903, "learning_rate": 5.052077486437576e-06, "loss": 0.4757, "step": 12760 }, { "epoch": 1.4972446945714621, "grad_norm": 1.4800385236740112, "learning_rate": 5.045914655158982e-06, "loss": 0.5371, "step": 12770 }, { "epoch": 1.4984171649665847, "grad_norm": 1.738107442855835, "learning_rate": 5.039751754119247e-06, "loss": 0.509, "step": 12780 }, { "epoch": 1.499589635361707, "grad_norm": 1.3127175569534302, "learning_rate": 5.033588792682067e-06, "loss": 0.5096, "step": 12790 }, { "epoch": 1.5007621057568297, "grad_norm": 1.4286155700683594, "learning_rate": 5.027425780211239e-06, "loss": 0.5182, "step": 12800 }, { "epoch": 1.5019345761519522, "grad_norm": 1.2364673614501953, "learning_rate": 5.021262726070625e-06, "loss": 0.518, "step": 12810 }, { "epoch": 1.5031070465470746, "grad_norm": 1.4245611429214478, "learning_rate": 5.0150996396241605e-06, "loss": 0.539, "step": 12820 }, { "epoch": 1.5042795169421972, "grad_norm": 1.3482962846755981, "learning_rate": 5.008936530235825e-06, "loss": 0.5148, "step": 12830 }, { "epoch": 1.5054519873373198, "grad_norm": 1.2987070083618164, "learning_rate": 5.002773407269632e-06, "loss": 0.4705, "step": 12840 }, { "epoch": 1.5066244577324421, "grad_norm": 1.7442587614059448, "learning_rate": 4.996610280089622e-06, "loss": 0.5286, "step": 12850 }, { "epoch": 1.507796928127565, "grad_norm": 1.397152304649353, "learning_rate": 4.9904471580598334e-06, "loss": 0.5018, "step": 12860 }, { "epoch": 1.5089693985226873, "grad_norm": 1.5800492763519287, "learning_rate": 4.9842840505443045e-06, "loss": 0.4401, "step": 12870 }, { "epoch": 1.5101418689178097, "grad_norm": 1.6143877506256104, "learning_rate": 4.978120966907047e-06, "loss": 0.5121, "step": 12880 }, { "epoch": 1.5113143393129325, "grad_norm": 1.4067411422729492, "learning_rate": 4.971957916512035e-06, "loss": 0.5301, "step": 12890 }, { "epoch": 1.5124868097080548, "grad_norm": 1.8939471244812012, "learning_rate": 4.965794908723197e-06, "loss": 0.5331, "step": 12900 }, { "epoch": 1.5136592801031774, "grad_norm": 1.435133457183838, "learning_rate": 4.9596319529043955e-06, "loss": 0.5252, "step": 12910 }, { "epoch": 1.5148317504983, "grad_norm": 1.3660833835601807, "learning_rate": 4.953469058419409e-06, "loss": 0.4863, "step": 12920 }, { "epoch": 1.5160042208934223, "grad_norm": 1.481065034866333, "learning_rate": 4.947306234631929e-06, "loss": 0.5074, "step": 12930 }, { "epoch": 1.517176691288545, "grad_norm": 1.2855244874954224, "learning_rate": 4.941143490905537e-06, "loss": 0.5139, "step": 12940 }, { "epoch": 1.5183491616836675, "grad_norm": 1.3858916759490967, "learning_rate": 4.93498083660369e-06, "loss": 0.4871, "step": 12950 }, { "epoch": 1.5195216320787899, "grad_norm": 1.4065240621566772, "learning_rate": 4.928818281089719e-06, "loss": 0.5263, "step": 12960 }, { "epoch": 1.5206941024739127, "grad_norm": 1.281459093093872, "learning_rate": 4.922655833726792e-06, "loss": 0.4654, "step": 12970 }, { "epoch": 1.521866572869035, "grad_norm": 1.3481875658035278, "learning_rate": 4.916493503877921e-06, "loss": 0.5021, "step": 12980 }, { "epoch": 1.5230390432641576, "grad_norm": 1.5862677097320557, "learning_rate": 4.910331300905936e-06, "loss": 0.4362, "step": 12990 }, { "epoch": 1.5242115136592802, "grad_norm": 1.0769898891448975, "learning_rate": 4.904169234173477e-06, "loss": 0.5132, "step": 13000 }, { "epoch": 1.5242115136592802, "eval_loss": 0.706162691116333, "eval_model_preparation_time": 0.0, "eval_runtime": 2143.5551, "eval_samples_per_second": 3.537, "eval_steps_per_second": 1.769, "step": 13000 }, { "epoch": 1.5253839840544026, "grad_norm": 1.3612005710601807, "learning_rate": 4.898007313042975e-06, "loss": 0.523, "step": 13010 }, { "epoch": 1.5265564544495251, "grad_norm": 1.328575611114502, "learning_rate": 4.891845546876638e-06, "loss": 0.5091, "step": 13020 }, { "epoch": 1.5277289248446477, "grad_norm": 1.4444586038589478, "learning_rate": 4.885683945036445e-06, "loss": 0.4967, "step": 13030 }, { "epoch": 1.52890139523977, "grad_norm": 1.2870855331420898, "learning_rate": 4.879522516884118e-06, "loss": 0.5136, "step": 13040 }, { "epoch": 1.5300738656348927, "grad_norm": 1.2046009302139282, "learning_rate": 4.8733612717811164e-06, "loss": 0.523, "step": 13050 }, { "epoch": 1.5312463360300153, "grad_norm": 1.422859787940979, "learning_rate": 4.86720021908863e-06, "loss": 0.4261, "step": 13060 }, { "epoch": 1.5324188064251376, "grad_norm": 1.5092412233352661, "learning_rate": 4.861039368167543e-06, "loss": 0.5161, "step": 13070 }, { "epoch": 1.5335912768202604, "grad_norm": 1.2206389904022217, "learning_rate": 4.8548787283784406e-06, "loss": 0.4647, "step": 13080 }, { "epoch": 1.5347637472153828, "grad_norm": 1.325558066368103, "learning_rate": 4.848718309081589e-06, "loss": 0.4993, "step": 13090 }, { "epoch": 1.5359362176105054, "grad_norm": 1.2404786348342896, "learning_rate": 4.842558119636917e-06, "loss": 0.5222, "step": 13100 }, { "epoch": 1.537108688005628, "grad_norm": 1.2435827255249023, "learning_rate": 4.836398169403998e-06, "loss": 0.5081, "step": 13110 }, { "epoch": 1.5382811584007503, "grad_norm": 1.2034730911254883, "learning_rate": 4.830238467742053e-06, "loss": 0.5401, "step": 13120 }, { "epoch": 1.5394536287958729, "grad_norm": 1.288844347000122, "learning_rate": 4.824079024009921e-06, "loss": 0.535, "step": 13130 }, { "epoch": 1.5406260991909955, "grad_norm": 1.472651720046997, "learning_rate": 4.817919847566041e-06, "loss": 0.4987, "step": 13140 }, { "epoch": 1.5417985695861178, "grad_norm": 1.4571154117584229, "learning_rate": 4.81176094776846e-06, "loss": 0.4588, "step": 13150 }, { "epoch": 1.5429710399812406, "grad_norm": 1.4962091445922852, "learning_rate": 4.805602333974795e-06, "loss": 0.4979, "step": 13160 }, { "epoch": 1.544143510376363, "grad_norm": 1.3251254558563232, "learning_rate": 4.799444015542232e-06, "loss": 0.4903, "step": 13170 }, { "epoch": 1.5453159807714854, "grad_norm": 1.4356613159179688, "learning_rate": 4.793286001827506e-06, "loss": 0.543, "step": 13180 }, { "epoch": 1.5464884511666082, "grad_norm": 1.0942418575286865, "learning_rate": 4.7871283021868926e-06, "loss": 0.4901, "step": 13190 }, { "epoch": 1.5476609215617305, "grad_norm": 1.3230329751968384, "learning_rate": 4.780970925976185e-06, "loss": 0.4791, "step": 13200 }, { "epoch": 1.548833391956853, "grad_norm": 1.3526240587234497, "learning_rate": 4.774813882550691e-06, "loss": 0.5116, "step": 13210 }, { "epoch": 1.5500058623519757, "grad_norm": 1.311937689781189, "learning_rate": 4.768657181265208e-06, "loss": 0.5071, "step": 13220 }, { "epoch": 1.551178332747098, "grad_norm": 1.2711786031723022, "learning_rate": 4.762500831474015e-06, "loss": 0.5147, "step": 13230 }, { "epoch": 1.5523508031422206, "grad_norm": 1.1421915292739868, "learning_rate": 4.756344842530859e-06, "loss": 0.5649, "step": 13240 }, { "epoch": 1.5535232735373432, "grad_norm": 1.2885173559188843, "learning_rate": 4.750189223788936e-06, "loss": 0.506, "step": 13250 }, { "epoch": 1.5546957439324656, "grad_norm": 1.419407606124878, "learning_rate": 4.74403398460088e-06, "loss": 0.4988, "step": 13260 }, { "epoch": 1.5558682143275884, "grad_norm": 1.5271703004837036, "learning_rate": 4.737879134318751e-06, "loss": 0.4904, "step": 13270 }, { "epoch": 1.5570406847227107, "grad_norm": 1.2795103788375854, "learning_rate": 4.731724682294014e-06, "loss": 0.4808, "step": 13280 }, { "epoch": 1.5582131551178333, "grad_norm": 1.427195429801941, "learning_rate": 4.725570637877531e-06, "loss": 0.508, "step": 13290 }, { "epoch": 1.559385625512956, "grad_norm": 1.3597995042800903, "learning_rate": 4.719417010419546e-06, "loss": 0.5122, "step": 13300 }, { "epoch": 1.5605580959080783, "grad_norm": 1.6512583494186401, "learning_rate": 4.713263809269666e-06, "loss": 0.5766, "step": 13310 }, { "epoch": 1.5617305663032008, "grad_norm": 1.2688370943069458, "learning_rate": 4.707111043776854e-06, "loss": 0.4673, "step": 13320 }, { "epoch": 1.5629030366983234, "grad_norm": 1.3154330253601074, "learning_rate": 4.70095872328941e-06, "loss": 0.5332, "step": 13330 }, { "epoch": 1.5640755070934458, "grad_norm": 1.3333938121795654, "learning_rate": 4.6948068571549555e-06, "loss": 0.4922, "step": 13340 }, { "epoch": 1.5652479774885684, "grad_norm": 1.4163548946380615, "learning_rate": 4.688655454720424e-06, "loss": 0.4968, "step": 13350 }, { "epoch": 1.566420447883691, "grad_norm": 1.8946317434310913, "learning_rate": 4.682504525332045e-06, "loss": 0.4962, "step": 13360 }, { "epoch": 1.5675929182788133, "grad_norm": 1.2292650938034058, "learning_rate": 4.676354078335327e-06, "loss": 0.5038, "step": 13370 }, { "epoch": 1.5687653886739361, "grad_norm": 1.5814419984817505, "learning_rate": 4.6702041230750476e-06, "loss": 0.5096, "step": 13380 }, { "epoch": 1.5699378590690585, "grad_norm": 1.4913325309753418, "learning_rate": 4.664054668895236e-06, "loss": 0.5263, "step": 13390 }, { "epoch": 1.571110329464181, "grad_norm": 1.0880714654922485, "learning_rate": 4.657905725139161e-06, "loss": 0.4858, "step": 13400 }, { "epoch": 1.5722827998593036, "grad_norm": 1.231573462486267, "learning_rate": 4.6517573011493125e-06, "loss": 0.5152, "step": 13410 }, { "epoch": 1.573455270254426, "grad_norm": 1.6119964122772217, "learning_rate": 4.6456094062674e-06, "loss": 0.5348, "step": 13420 }, { "epoch": 1.5746277406495486, "grad_norm": 1.6427700519561768, "learning_rate": 4.639462049834318e-06, "loss": 0.5248, "step": 13430 }, { "epoch": 1.5758002110446712, "grad_norm": 1.3604979515075684, "learning_rate": 4.633315241190146e-06, "loss": 0.4841, "step": 13440 }, { "epoch": 1.5769726814397935, "grad_norm": 1.2496000528335571, "learning_rate": 4.627168989674139e-06, "loss": 0.4496, "step": 13450 }, { "epoch": 1.5781451518349163, "grad_norm": 1.470560908317566, "learning_rate": 4.6210233046246946e-06, "loss": 0.5047, "step": 13460 }, { "epoch": 1.5793176222300387, "grad_norm": 1.2653249502182007, "learning_rate": 4.614878195379353e-06, "loss": 0.4996, "step": 13470 }, { "epoch": 1.5804900926251613, "grad_norm": 1.3563830852508545, "learning_rate": 4.608733671274786e-06, "loss": 0.5029, "step": 13480 }, { "epoch": 1.5816625630202839, "grad_norm": 1.4104876518249512, "learning_rate": 4.60258974164677e-06, "loss": 0.4994, "step": 13490 }, { "epoch": 1.5828350334154062, "grad_norm": 1.2753397226333618, "learning_rate": 4.596446415830176e-06, "loss": 0.4776, "step": 13500 }, { "epoch": 1.5828350334154062, "eval_loss": 0.7014015316963196, "eval_model_preparation_time": 0.0, "eval_runtime": 2149.3061, "eval_samples_per_second": 3.528, "eval_steps_per_second": 1.764, "step": 13500 }, { "epoch": 1.5840075038105288, "grad_norm": 1.3716660737991333, "learning_rate": 4.590303703158965e-06, "loss": 0.4294, "step": 13510 }, { "epoch": 1.5851799742056514, "grad_norm": 1.3998048305511475, "learning_rate": 4.584161612966162e-06, "loss": 0.4812, "step": 13520 }, { "epoch": 1.5863524446007737, "grad_norm": 1.3521205186843872, "learning_rate": 4.578020154583843e-06, "loss": 0.5116, "step": 13530 }, { "epoch": 1.5875249149958963, "grad_norm": 1.3397890329360962, "learning_rate": 4.571879337343132e-06, "loss": 0.4841, "step": 13540 }, { "epoch": 1.588697385391019, "grad_norm": 1.3064327239990234, "learning_rate": 4.565739170574175e-06, "loss": 0.4461, "step": 13550 }, { "epoch": 1.5898698557861413, "grad_norm": 1.3202391862869263, "learning_rate": 4.5595996636061254e-06, "loss": 0.5032, "step": 13560 }, { "epoch": 1.591042326181264, "grad_norm": 1.1931742429733276, "learning_rate": 4.553460825767142e-06, "loss": 0.4532, "step": 13570 }, { "epoch": 1.5922147965763864, "grad_norm": 1.369465708732605, "learning_rate": 4.547322666384361e-06, "loss": 0.4658, "step": 13580 }, { "epoch": 1.593387266971509, "grad_norm": 1.359597086906433, "learning_rate": 4.54118519478389e-06, "loss": 0.4609, "step": 13590 }, { "epoch": 1.5945597373666316, "grad_norm": 1.3049198389053345, "learning_rate": 4.535048420290792e-06, "loss": 0.4712, "step": 13600 }, { "epoch": 1.595732207761754, "grad_norm": 1.3125094175338745, "learning_rate": 4.52891235222907e-06, "loss": 0.5444, "step": 13610 }, { "epoch": 1.5969046781568765, "grad_norm": 1.4030203819274902, "learning_rate": 4.522776999921655e-06, "loss": 0.4905, "step": 13620 }, { "epoch": 1.5980771485519991, "grad_norm": 1.2270499467849731, "learning_rate": 4.516642372690386e-06, "loss": 0.4909, "step": 13630 }, { "epoch": 1.5992496189471215, "grad_norm": 1.3604707717895508, "learning_rate": 4.510508479856008e-06, "loss": 0.5021, "step": 13640 }, { "epoch": 1.600422089342244, "grad_norm": 1.1583906412124634, "learning_rate": 4.504375330738144e-06, "loss": 0.5027, "step": 13650 }, { "epoch": 1.6015945597373666, "grad_norm": 1.3817378282546997, "learning_rate": 4.498242934655289e-06, "loss": 0.4663, "step": 13660 }, { "epoch": 1.602767030132489, "grad_norm": 1.4213110208511353, "learning_rate": 4.4921113009247944e-06, "loss": 0.4681, "step": 13670 }, { "epoch": 1.6039395005276118, "grad_norm": 1.5306822061538696, "learning_rate": 4.485980438862852e-06, "loss": 0.4939, "step": 13680 }, { "epoch": 1.6051119709227342, "grad_norm": 1.5451979637145996, "learning_rate": 4.4798503577844824e-06, "loss": 0.4842, "step": 13690 }, { "epoch": 1.6062844413178567, "grad_norm": 1.232155203819275, "learning_rate": 4.473721067003519e-06, "loss": 0.4782, "step": 13700 }, { "epoch": 1.6074569117129793, "grad_norm": 1.644090175628662, "learning_rate": 4.467592575832595e-06, "loss": 0.473, "step": 13710 }, { "epoch": 1.6086293821081017, "grad_norm": 1.2632887363433838, "learning_rate": 4.461464893583127e-06, "loss": 0.4777, "step": 13720 }, { "epoch": 1.6098018525032243, "grad_norm": 1.6878037452697754, "learning_rate": 4.455338029565306e-06, "loss": 0.4814, "step": 13730 }, { "epoch": 1.6109743228983469, "grad_norm": 1.4315028190612793, "learning_rate": 4.449211993088076e-06, "loss": 0.5058, "step": 13740 }, { "epoch": 1.6121467932934692, "grad_norm": 1.4610106945037842, "learning_rate": 4.443086793459125e-06, "loss": 0.5265, "step": 13750 }, { "epoch": 1.613319263688592, "grad_norm": 1.3483843803405762, "learning_rate": 4.4369624399848705e-06, "loss": 0.542, "step": 13760 }, { "epoch": 1.6144917340837144, "grad_norm": 1.3614999055862427, "learning_rate": 4.4308389419704445e-06, "loss": 0.488, "step": 13770 }, { "epoch": 1.615664204478837, "grad_norm": 1.2286779880523682, "learning_rate": 4.424716308719677e-06, "loss": 0.4871, "step": 13780 }, { "epoch": 1.6168366748739595, "grad_norm": 1.659528136253357, "learning_rate": 4.418594549535086e-06, "loss": 0.4762, "step": 13790 }, { "epoch": 1.618009145269082, "grad_norm": 1.154839038848877, "learning_rate": 4.412473673717862e-06, "loss": 0.4869, "step": 13800 }, { "epoch": 1.6191816156642045, "grad_norm": 1.6731452941894531, "learning_rate": 4.40635369056785e-06, "loss": 0.4869, "step": 13810 }, { "epoch": 1.620354086059327, "grad_norm": 1.3127319812774658, "learning_rate": 4.400234609383545e-06, "loss": 0.4823, "step": 13820 }, { "epoch": 1.6215265564544494, "grad_norm": 1.680890440940857, "learning_rate": 4.394116439462065e-06, "loss": 0.5068, "step": 13830 }, { "epoch": 1.622699026849572, "grad_norm": 1.2701610326766968, "learning_rate": 4.387999190099145e-06, "loss": 0.4812, "step": 13840 }, { "epoch": 1.6238714972446946, "grad_norm": 1.5461081266403198, "learning_rate": 4.381882870589124e-06, "loss": 0.5379, "step": 13850 }, { "epoch": 1.625043967639817, "grad_norm": 1.3125146627426147, "learning_rate": 4.375767490224929e-06, "loss": 0.5257, "step": 13860 }, { "epoch": 1.6262164380349398, "grad_norm": 1.4857646226882935, "learning_rate": 4.369653058298052e-06, "loss": 0.5551, "step": 13870 }, { "epoch": 1.6273889084300621, "grad_norm": 1.3318350315093994, "learning_rate": 4.363539584098556e-06, "loss": 0.4748, "step": 13880 }, { "epoch": 1.6285613788251847, "grad_norm": 1.353829026222229, "learning_rate": 4.357427076915039e-06, "loss": 0.4614, "step": 13890 }, { "epoch": 1.6297338492203073, "grad_norm": 1.5222121477127075, "learning_rate": 4.351315546034631e-06, "loss": 0.5191, "step": 13900 }, { "epoch": 1.6309063196154296, "grad_norm": 1.5057324171066284, "learning_rate": 4.345205000742989e-06, "loss": 0.4758, "step": 13910 }, { "epoch": 1.6320787900105522, "grad_norm": 1.2480406761169434, "learning_rate": 4.339095450324258e-06, "loss": 0.4788, "step": 13920 }, { "epoch": 1.6332512604056748, "grad_norm": 1.5111087560653687, "learning_rate": 4.332986904061078e-06, "loss": 0.45, "step": 13930 }, { "epoch": 1.6344237308007972, "grad_norm": 1.4821648597717285, "learning_rate": 4.326879371234565e-06, "loss": 0.4976, "step": 13940 }, { "epoch": 1.6355962011959198, "grad_norm": 1.1379632949829102, "learning_rate": 4.320772861124296e-06, "loss": 0.5447, "step": 13950 }, { "epoch": 1.6367686715910423, "grad_norm": 1.408944010734558, "learning_rate": 4.314667383008285e-06, "loss": 0.4667, "step": 13960 }, { "epoch": 1.6379411419861647, "grad_norm": 1.3564339876174927, "learning_rate": 4.308562946162991e-06, "loss": 0.5406, "step": 13970 }, { "epoch": 1.6391136123812875, "grad_norm": 1.3144629001617432, "learning_rate": 4.3024595598632835e-06, "loss": 0.4969, "step": 13980 }, { "epoch": 1.6402860827764099, "grad_norm": 1.3377125263214111, "learning_rate": 4.296357233382432e-06, "loss": 0.4833, "step": 13990 }, { "epoch": 1.6414585531715324, "grad_norm": 1.390093445777893, "learning_rate": 4.290255975992106e-06, "loss": 0.4716, "step": 14000 }, { "epoch": 1.6414585531715324, "eval_loss": 0.7012301087379456, "eval_model_preparation_time": 0.0, "eval_runtime": 2146.8145, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.766, "step": 14000 }, { "epoch": 1.642631023566655, "grad_norm": 1.2645139694213867, "learning_rate": 4.284155796962345e-06, "loss": 0.4456, "step": 14010 }, { "epoch": 1.6438034939617774, "grad_norm": 1.4211381673812866, "learning_rate": 4.278056705561546e-06, "loss": 0.4991, "step": 14020 }, { "epoch": 1.6449759643569, "grad_norm": 1.3785030841827393, "learning_rate": 4.271958711056463e-06, "loss": 0.4917, "step": 14030 }, { "epoch": 1.6461484347520225, "grad_norm": 1.4608776569366455, "learning_rate": 4.265861822712176e-06, "loss": 0.4415, "step": 14040 }, { "epoch": 1.647320905147145, "grad_norm": 1.2058929204940796, "learning_rate": 4.2597660497920845e-06, "loss": 0.531, "step": 14050 }, { "epoch": 1.6484933755422677, "grad_norm": 1.6772843599319458, "learning_rate": 4.2536714015578995e-06, "loss": 0.5183, "step": 14060 }, { "epoch": 1.64966584593739, "grad_norm": 1.3990840911865234, "learning_rate": 4.247577887269617e-06, "loss": 0.4958, "step": 14070 }, { "epoch": 1.6508383163325127, "grad_norm": 1.3246831893920898, "learning_rate": 4.241485516185512e-06, "loss": 0.5031, "step": 14080 }, { "epoch": 1.6520107867276352, "grad_norm": 1.3914755582809448, "learning_rate": 4.235394297562123e-06, "loss": 0.4945, "step": 14090 }, { "epoch": 1.6531832571227576, "grad_norm": 1.367789387702942, "learning_rate": 4.2293042406542385e-06, "loss": 0.4912, "step": 14100 }, { "epoch": 1.6543557275178802, "grad_norm": 1.2854039669036865, "learning_rate": 4.223215354714878e-06, "loss": 0.4516, "step": 14110 }, { "epoch": 1.6555281979130028, "grad_norm": 1.6160378456115723, "learning_rate": 4.217127648995288e-06, "loss": 0.4767, "step": 14120 }, { "epoch": 1.6567006683081251, "grad_norm": 1.288645625114441, "learning_rate": 4.211041132744915e-06, "loss": 0.4765, "step": 14130 }, { "epoch": 1.6578731387032477, "grad_norm": 1.2393230199813843, "learning_rate": 4.204955815211403e-06, "loss": 0.5155, "step": 14140 }, { "epoch": 1.6590456090983703, "grad_norm": 1.228028655052185, "learning_rate": 4.1988717056405745e-06, "loss": 0.4373, "step": 14150 }, { "epoch": 1.6602180794934926, "grad_norm": 1.5805541276931763, "learning_rate": 4.192788813276415e-06, "loss": 0.5428, "step": 14160 }, { "epoch": 1.6613905498886155, "grad_norm": 1.1964565515518188, "learning_rate": 4.186707147361059e-06, "loss": 0.4521, "step": 14170 }, { "epoch": 1.6625630202837378, "grad_norm": 1.6208343505859375, "learning_rate": 4.180626717134782e-06, "loss": 0.4687, "step": 14180 }, { "epoch": 1.6637354906788604, "grad_norm": 2.011824369430542, "learning_rate": 4.174547531835979e-06, "loss": 0.5187, "step": 14190 }, { "epoch": 1.664907961073983, "grad_norm": 1.228571891784668, "learning_rate": 4.168469600701154e-06, "loss": 0.4788, "step": 14200 }, { "epoch": 1.6660804314691053, "grad_norm": 1.1814303398132324, "learning_rate": 4.162392932964907e-06, "loss": 0.4821, "step": 14210 }, { "epoch": 1.667252901864228, "grad_norm": 1.4269133806228638, "learning_rate": 4.156317537859915e-06, "loss": 0.4984, "step": 14220 }, { "epoch": 1.6684253722593505, "grad_norm": 1.293644905090332, "learning_rate": 4.150243424616925e-06, "loss": 0.528, "step": 14230 }, { "epoch": 1.6695978426544729, "grad_norm": 1.045831322669983, "learning_rate": 4.1441706024647356e-06, "loss": 0.5242, "step": 14240 }, { "epoch": 1.6707703130495954, "grad_norm": 1.5239168405532837, "learning_rate": 4.138099080630181e-06, "loss": 0.5215, "step": 14250 }, { "epoch": 1.671942783444718, "grad_norm": 1.5626978874206543, "learning_rate": 4.132028868338123e-06, "loss": 0.5032, "step": 14260 }, { "epoch": 1.6731152538398404, "grad_norm": 1.4458396434783936, "learning_rate": 4.125959974811433e-06, "loss": 0.4843, "step": 14270 }, { "epoch": 1.6742877242349632, "grad_norm": 1.5009219646453857, "learning_rate": 4.119892409270978e-06, "loss": 0.5205, "step": 14280 }, { "epoch": 1.6754601946300856, "grad_norm": 1.4347405433654785, "learning_rate": 4.113826180935606e-06, "loss": 0.4951, "step": 14290 }, { "epoch": 1.6766326650252081, "grad_norm": 1.0906822681427002, "learning_rate": 4.107761299022137e-06, "loss": 0.4955, "step": 14300 }, { "epoch": 1.6778051354203307, "grad_norm": 1.5001709461212158, "learning_rate": 4.101697772745342e-06, "loss": 0.5054, "step": 14310 }, { "epoch": 1.678977605815453, "grad_norm": 1.3548184633255005, "learning_rate": 4.0956356113179354e-06, "loss": 0.5215, "step": 14320 }, { "epoch": 1.6801500762105757, "grad_norm": 1.4048808813095093, "learning_rate": 4.089574823950551e-06, "loss": 0.4786, "step": 14330 }, { "epoch": 1.6813225466056982, "grad_norm": 1.4869043827056885, "learning_rate": 4.083515419851748e-06, "loss": 0.4457, "step": 14340 }, { "epoch": 1.6824950170008206, "grad_norm": 1.3299458026885986, "learning_rate": 4.077457408227968e-06, "loss": 0.4852, "step": 14350 }, { "epoch": 1.6836674873959434, "grad_norm": 1.5666648149490356, "learning_rate": 4.071400798283546e-06, "loss": 0.5524, "step": 14360 }, { "epoch": 1.6848399577910658, "grad_norm": 1.5056836605072021, "learning_rate": 4.065345599220692e-06, "loss": 0.5174, "step": 14370 }, { "epoch": 1.6860124281861883, "grad_norm": 1.2978582382202148, "learning_rate": 4.05929182023946e-06, "loss": 0.4801, "step": 14380 }, { "epoch": 1.687184898581311, "grad_norm": 1.9562956094741821, "learning_rate": 4.053239470537753e-06, "loss": 0.5393, "step": 14390 }, { "epoch": 1.6883573689764333, "grad_norm": 1.1834620237350464, "learning_rate": 4.047188559311306e-06, "loss": 0.5497, "step": 14400 }, { "epoch": 1.6895298393715559, "grad_norm": 1.447541356086731, "learning_rate": 4.041139095753664e-06, "loss": 0.4676, "step": 14410 }, { "epoch": 1.6907023097666785, "grad_norm": 1.2977677583694458, "learning_rate": 4.035091089056168e-06, "loss": 0.4905, "step": 14420 }, { "epoch": 1.6918747801618008, "grad_norm": 1.4030280113220215, "learning_rate": 4.0290445484079574e-06, "loss": 0.4729, "step": 14430 }, { "epoch": 1.6930472505569234, "grad_norm": 1.3881504535675049, "learning_rate": 4.022999482995934e-06, "loss": 0.477, "step": 14440 }, { "epoch": 1.694219720952046, "grad_norm": 1.3962290287017822, "learning_rate": 4.016955902004759e-06, "loss": 0.4994, "step": 14450 }, { "epoch": 1.6953921913471683, "grad_norm": 1.5450801849365234, "learning_rate": 4.0109138146168445e-06, "loss": 0.4442, "step": 14460 }, { "epoch": 1.6965646617422911, "grad_norm": 1.3925830125808716, "learning_rate": 4.004873230012329e-06, "loss": 0.4933, "step": 14470 }, { "epoch": 1.6977371321374135, "grad_norm": 1.4856795072555542, "learning_rate": 3.998834157369066e-06, "loss": 0.4998, "step": 14480 }, { "epoch": 1.698909602532536, "grad_norm": 1.2542918920516968, "learning_rate": 3.992796605862616e-06, "loss": 0.4385, "step": 14490 }, { "epoch": 1.7000820729276587, "grad_norm": 1.596177339553833, "learning_rate": 3.986760584666226e-06, "loss": 0.4845, "step": 14500 }, { "epoch": 1.7000820729276587, "eval_loss": 0.6941739916801453, "eval_model_preparation_time": 0.0, "eval_runtime": 2147.5309, "eval_samples_per_second": 3.531, "eval_steps_per_second": 1.765, "step": 14500 }, { "epoch": 1.701254543322781, "grad_norm": 1.6662501096725464, "learning_rate": 3.980726102950816e-06, "loss": 0.5162, "step": 14510 }, { "epoch": 1.7024270137179036, "grad_norm": 1.5629969835281372, "learning_rate": 3.974693169884974e-06, "loss": 0.4782, "step": 14520 }, { "epoch": 1.7035994841130262, "grad_norm": 1.0672224760055542, "learning_rate": 3.968661794634925e-06, "loss": 0.4729, "step": 14530 }, { "epoch": 1.7047719545081486, "grad_norm": 1.5036375522613525, "learning_rate": 3.962631986364534e-06, "loss": 0.5127, "step": 14540 }, { "epoch": 1.7059444249032711, "grad_norm": 1.4320217370986938, "learning_rate": 3.956603754235285e-06, "loss": 0.5027, "step": 14550 }, { "epoch": 1.7071168952983937, "grad_norm": 1.3801813125610352, "learning_rate": 3.950577107406265e-06, "loss": 0.499, "step": 14560 }, { "epoch": 1.708289365693516, "grad_norm": 1.3937146663665771, "learning_rate": 3.944552055034151e-06, "loss": 0.5082, "step": 14570 }, { "epoch": 1.7094618360886389, "grad_norm": 1.2855660915374756, "learning_rate": 3.938528606273202e-06, "loss": 0.5027, "step": 14580 }, { "epoch": 1.7106343064837612, "grad_norm": 1.3835794925689697, "learning_rate": 3.932506770275238e-06, "loss": 0.4863, "step": 14590 }, { "epoch": 1.7118067768788838, "grad_norm": 1.7914962768554688, "learning_rate": 3.926486556189627e-06, "loss": 0.4905, "step": 14600 }, { "epoch": 1.7129792472740064, "grad_norm": 2.443002462387085, "learning_rate": 3.920467973163277e-06, "loss": 0.5086, "step": 14610 }, { "epoch": 1.7141517176691288, "grad_norm": 1.3056201934814453, "learning_rate": 3.914451030340614e-06, "loss": 0.4457, "step": 14620 }, { "epoch": 1.7153241880642514, "grad_norm": 1.3906561136245728, "learning_rate": 3.90843573686357e-06, "loss": 0.5064, "step": 14630 }, { "epoch": 1.716496658459374, "grad_norm": 1.4775606393814087, "learning_rate": 3.90242210187158e-06, "loss": 0.4487, "step": 14640 }, { "epoch": 1.7176691288544963, "grad_norm": 1.4641501903533936, "learning_rate": 3.89641013450155e-06, "loss": 0.4996, "step": 14650 }, { "epoch": 1.718841599249619, "grad_norm": 1.1139001846313477, "learning_rate": 3.890399843887854e-06, "loss": 0.4994, "step": 14660 }, { "epoch": 1.7200140696447415, "grad_norm": 1.359594702720642, "learning_rate": 3.884391239162323e-06, "loss": 0.4666, "step": 14670 }, { "epoch": 1.721186540039864, "grad_norm": 2.0527658462524414, "learning_rate": 3.878384329454223e-06, "loss": 0.4858, "step": 14680 }, { "epoch": 1.7223590104349866, "grad_norm": 1.6774019002914429, "learning_rate": 3.8723791238902445e-06, "loss": 0.501, "step": 14690 }, { "epoch": 1.723531480830109, "grad_norm": 1.4857747554779053, "learning_rate": 3.866375631594491e-06, "loss": 0.4996, "step": 14700 }, { "epoch": 1.7247039512252316, "grad_norm": 1.3636287450790405, "learning_rate": 3.860373861688459e-06, "loss": 0.5179, "step": 14710 }, { "epoch": 1.7258764216203542, "grad_norm": 1.3336663246154785, "learning_rate": 3.854373823291031e-06, "loss": 0.5009, "step": 14720 }, { "epoch": 1.7270488920154765, "grad_norm": 1.5828659534454346, "learning_rate": 3.848375525518462e-06, "loss": 0.5104, "step": 14730 }, { "epoch": 1.728221362410599, "grad_norm": 1.2788207530975342, "learning_rate": 3.842378977484353e-06, "loss": 0.4621, "step": 14740 }, { "epoch": 1.7293938328057217, "grad_norm": 1.5213638544082642, "learning_rate": 3.836384188299652e-06, "loss": 0.4706, "step": 14750 }, { "epoch": 1.730566303200844, "grad_norm": 1.4543811082839966, "learning_rate": 3.830391167072642e-06, "loss": 0.4484, "step": 14760 }, { "epoch": 1.7317387735959668, "grad_norm": 1.5175144672393799, "learning_rate": 3.824399922908906e-06, "loss": 0.5068, "step": 14770 }, { "epoch": 1.7329112439910892, "grad_norm": 1.3589184284210205, "learning_rate": 3.8184104649113325e-06, "loss": 0.4804, "step": 14780 }, { "epoch": 1.7340837143862118, "grad_norm": 1.2560317516326904, "learning_rate": 3.8124228021801025e-06, "loss": 0.4442, "step": 14790 }, { "epoch": 1.7352561847813344, "grad_norm": 1.3842132091522217, "learning_rate": 3.806436943812662e-06, "loss": 0.4897, "step": 14800 }, { "epoch": 1.7364286551764567, "grad_norm": 1.1729717254638672, "learning_rate": 3.8004528989037137e-06, "loss": 0.4292, "step": 14810 }, { "epoch": 1.7376011255715793, "grad_norm": 1.2541546821594238, "learning_rate": 3.7944706765452137e-06, "loss": 0.4758, "step": 14820 }, { "epoch": 1.738773595966702, "grad_norm": 1.5704262256622314, "learning_rate": 3.7884902858263438e-06, "loss": 0.4755, "step": 14830 }, { "epoch": 1.7399460663618243, "grad_norm": 1.4482183456420898, "learning_rate": 3.7825117358334983e-06, "loss": 0.5346, "step": 14840 }, { "epoch": 1.7411185367569468, "grad_norm": 1.4833600521087646, "learning_rate": 3.7765350356502857e-06, "loss": 0.4965, "step": 14850 }, { "epoch": 1.7422910071520694, "grad_norm": 1.5330383777618408, "learning_rate": 3.7705601943574947e-06, "loss": 0.497, "step": 14860 }, { "epoch": 1.7434634775471918, "grad_norm": 1.4671924114227295, "learning_rate": 3.7645872210330924e-06, "loss": 0.482, "step": 14870 }, { "epoch": 1.7446359479423146, "grad_norm": 1.253153681755066, "learning_rate": 3.7586161247522105e-06, "loss": 0.5097, "step": 14880 }, { "epoch": 1.745808418337437, "grad_norm": 1.773034930229187, "learning_rate": 3.752646914587125e-06, "loss": 0.5358, "step": 14890 }, { "epoch": 1.7469808887325595, "grad_norm": 1.3178410530090332, "learning_rate": 3.746679599607249e-06, "loss": 0.4762, "step": 14900 }, { "epoch": 1.748153359127682, "grad_norm": 1.1125977039337158, "learning_rate": 3.740714188879111e-06, "loss": 0.4844, "step": 14910 }, { "epoch": 1.7493258295228045, "grad_norm": 1.155515432357788, "learning_rate": 3.7347506914663545e-06, "loss": 0.5417, "step": 14920 }, { "epoch": 1.750498299917927, "grad_norm": 1.3297897577285767, "learning_rate": 3.72878911642971e-06, "loss": 0.5185, "step": 14930 }, { "epoch": 1.7516707703130496, "grad_norm": 1.4646992683410645, "learning_rate": 3.7228294728269874e-06, "loss": 0.5233, "step": 14940 }, { "epoch": 1.752843240708172, "grad_norm": 1.1954569816589355, "learning_rate": 3.716871769713066e-06, "loss": 0.5375, "step": 14950 }, { "epoch": 1.7540157111032948, "grad_norm": 1.4385180473327637, "learning_rate": 3.710916016139873e-06, "loss": 0.5037, "step": 14960 }, { "epoch": 1.7551881814984172, "grad_norm": 1.197589635848999, "learning_rate": 3.704962221156374e-06, "loss": 0.4614, "step": 14970 }, { "epoch": 1.7563606518935397, "grad_norm": 1.1038477420806885, "learning_rate": 3.6990103938085603e-06, "loss": 0.4652, "step": 14980 }, { "epoch": 1.7575331222886623, "grad_norm": 1.3697733879089355, "learning_rate": 3.693060543139433e-06, "loss": 0.5417, "step": 14990 }, { "epoch": 1.7587055926837847, "grad_norm": 1.4433447122573853, "learning_rate": 3.6871126781889886e-06, "loss": 0.4675, "step": 15000 }, { "epoch": 1.7587055926837847, "eval_loss": 0.6903113722801208, "eval_model_preparation_time": 0.0, "eval_runtime": 2150.445, "eval_samples_per_second": 3.526, "eval_steps_per_second": 1.763, "step": 15000 }, { "epoch": 1.7598780630789073, "grad_norm": 1.7080689668655396, "learning_rate": 3.68116680799421e-06, "loss": 0.5019, "step": 15010 }, { "epoch": 1.7610505334740298, "grad_norm": 1.5587687492370605, "learning_rate": 3.675222941589045e-06, "loss": 0.5526, "step": 15020 }, { "epoch": 1.7622230038691522, "grad_norm": 1.3273091316223145, "learning_rate": 3.6692810880043984e-06, "loss": 0.4686, "step": 15030 }, { "epoch": 1.7633954742642748, "grad_norm": 1.1941251754760742, "learning_rate": 3.66334125626812e-06, "loss": 0.4989, "step": 15040 }, { "epoch": 1.7645679446593974, "grad_norm": 1.4332523345947266, "learning_rate": 3.657403455404984e-06, "loss": 0.5058, "step": 15050 }, { "epoch": 1.7657404150545197, "grad_norm": 1.6299152374267578, "learning_rate": 3.6514676944366777e-06, "loss": 0.4721, "step": 15060 }, { "epoch": 1.7669128854496425, "grad_norm": 1.5166678428649902, "learning_rate": 3.6455339823817958e-06, "loss": 0.4789, "step": 15070 }, { "epoch": 1.768085355844765, "grad_norm": 1.3713551759719849, "learning_rate": 3.6396023282558122e-06, "loss": 0.5063, "step": 15080 }, { "epoch": 1.7692578262398875, "grad_norm": 1.4968557357788086, "learning_rate": 3.6336727410710772e-06, "loss": 0.4942, "step": 15090 }, { "epoch": 1.77043029663501, "grad_norm": 1.3596172332763672, "learning_rate": 3.6277452298368034e-06, "loss": 0.4949, "step": 15100 }, { "epoch": 1.7716027670301324, "grad_norm": 1.5429645776748657, "learning_rate": 3.6218198035590444e-06, "loss": 0.5143, "step": 15110 }, { "epoch": 1.772775237425255, "grad_norm": 1.301780343055725, "learning_rate": 3.6158964712406878e-06, "loss": 0.507, "step": 15120 }, { "epoch": 1.7739477078203776, "grad_norm": 1.338851809501648, "learning_rate": 3.6099752418814433e-06, "loss": 0.5031, "step": 15130 }, { "epoch": 1.7751201782155, "grad_norm": 1.6053999662399292, "learning_rate": 3.6040561244778206e-06, "loss": 0.491, "step": 15140 }, { "epoch": 1.7762926486106225, "grad_norm": 1.2752022743225098, "learning_rate": 3.5981391280231206e-06, "loss": 0.4552, "step": 15150 }, { "epoch": 1.777465119005745, "grad_norm": 1.927147388458252, "learning_rate": 3.5922242615074256e-06, "loss": 0.4933, "step": 15160 }, { "epoch": 1.7786375894008675, "grad_norm": 1.4292235374450684, "learning_rate": 3.58631153391758e-06, "loss": 0.4748, "step": 15170 }, { "epoch": 1.7798100597959903, "grad_norm": 1.6437844038009644, "learning_rate": 3.580400954237174e-06, "loss": 0.5195, "step": 15180 }, { "epoch": 1.7809825301911126, "grad_norm": 1.4947909116744995, "learning_rate": 3.574492531446544e-06, "loss": 0.4949, "step": 15190 }, { "epoch": 1.7821550005862352, "grad_norm": 1.3268496990203857, "learning_rate": 3.5685862745227404e-06, "loss": 0.5205, "step": 15200 }, { "epoch": 1.7833274709813578, "grad_norm": 1.2201547622680664, "learning_rate": 3.5626821924395237e-06, "loss": 0.4333, "step": 15210 }, { "epoch": 1.7844999413764802, "grad_norm": 1.5140401124954224, "learning_rate": 3.556780294167359e-06, "loss": 0.4906, "step": 15220 }, { "epoch": 1.7856724117716027, "grad_norm": 1.2802538871765137, "learning_rate": 3.550880588673381e-06, "loss": 0.5143, "step": 15230 }, { "epoch": 1.7868448821667253, "grad_norm": 1.5614464282989502, "learning_rate": 3.5449830849213986e-06, "loss": 0.4764, "step": 15240 }, { "epoch": 1.7880173525618477, "grad_norm": 1.3944580554962158, "learning_rate": 3.539087791871879e-06, "loss": 0.5045, "step": 15250 }, { "epoch": 1.7891898229569705, "grad_norm": 1.4763245582580566, "learning_rate": 3.5331947184819258e-06, "loss": 0.5225, "step": 15260 }, { "epoch": 1.7903622933520928, "grad_norm": 1.5548820495605469, "learning_rate": 3.5273038737052676e-06, "loss": 0.4718, "step": 15270 }, { "epoch": 1.7915347637472154, "grad_norm": 1.3862028121948242, "learning_rate": 3.5214152664922553e-06, "loss": 0.4699, "step": 15280 }, { "epoch": 1.792707234142338, "grad_norm": 1.3243000507354736, "learning_rate": 3.5155289057898347e-06, "loss": 0.4756, "step": 15290 }, { "epoch": 1.7938797045374604, "grad_norm": 1.3286547660827637, "learning_rate": 3.509644800541534e-06, "loss": 0.4446, "step": 15300 }, { "epoch": 1.795052174932583, "grad_norm": 2.2336156368255615, "learning_rate": 3.5037629596874657e-06, "loss": 0.4816, "step": 15310 }, { "epoch": 1.7962246453277055, "grad_norm": 1.3251581192016602, "learning_rate": 3.497883392164293e-06, "loss": 0.5521, "step": 15320 }, { "epoch": 1.797397115722828, "grad_norm": 1.504103183746338, "learning_rate": 3.4920061069052276e-06, "loss": 0.4984, "step": 15330 }, { "epoch": 1.7985695861179505, "grad_norm": 1.3074688911437988, "learning_rate": 3.486131112840014e-06, "loss": 0.5243, "step": 15340 }, { "epoch": 1.799742056513073, "grad_norm": 1.451684594154358, "learning_rate": 3.4802584188949175e-06, "loss": 0.4733, "step": 15350 }, { "epoch": 1.8009145269081954, "grad_norm": 1.3025407791137695, "learning_rate": 3.474388033992702e-06, "loss": 0.5077, "step": 15360 }, { "epoch": 1.8020869973033182, "grad_norm": 1.4639246463775635, "learning_rate": 3.4685199670526317e-06, "loss": 0.5119, "step": 15370 }, { "epoch": 1.8032594676984406, "grad_norm": 1.5355656147003174, "learning_rate": 3.4626542269904433e-06, "loss": 0.4827, "step": 15380 }, { "epoch": 1.8044319380935632, "grad_norm": 1.5206661224365234, "learning_rate": 3.456790822718339e-06, "loss": 0.4765, "step": 15390 }, { "epoch": 1.8056044084886858, "grad_norm": 1.3644640445709229, "learning_rate": 3.4509297631449747e-06, "loss": 0.4544, "step": 15400 }, { "epoch": 1.8067768788838081, "grad_norm": 1.86026930809021, "learning_rate": 3.4450710571754398e-06, "loss": 0.47, "step": 15410 }, { "epoch": 1.8079493492789307, "grad_norm": 1.0519914627075195, "learning_rate": 3.439214713711251e-06, "loss": 0.5049, "step": 15420 }, { "epoch": 1.8091218196740533, "grad_norm": 1.0633872747421265, "learning_rate": 3.4333607416503343e-06, "loss": 0.4681, "step": 15430 }, { "epoch": 1.8102942900691756, "grad_norm": 1.3743951320648193, "learning_rate": 3.427509149887013e-06, "loss": 0.5218, "step": 15440 }, { "epoch": 1.8114667604642984, "grad_norm": 1.2745856046676636, "learning_rate": 3.4216599473119926e-06, "loss": 0.4586, "step": 15450 }, { "epoch": 1.8126392308594208, "grad_norm": 1.669543981552124, "learning_rate": 3.4158131428123508e-06, "loss": 0.4834, "step": 15460 }, { "epoch": 1.8138117012545432, "grad_norm": 1.400428295135498, "learning_rate": 3.40996874527152e-06, "loss": 0.5175, "step": 15470 }, { "epoch": 1.814984171649666, "grad_norm": 1.518678903579712, "learning_rate": 3.4041267635692747e-06, "loss": 0.5681, "step": 15480 }, { "epoch": 1.8161566420447883, "grad_norm": 1.4416635036468506, "learning_rate": 3.398287206581723e-06, "loss": 0.4808, "step": 15490 }, { "epoch": 1.817329112439911, "grad_norm": 1.4627622365951538, "learning_rate": 3.3924500831812833e-06, "loss": 0.4962, "step": 15500 }, { "epoch": 1.817329112439911, "eval_loss": 0.6897906064987183, "eval_model_preparation_time": 0.0, "eval_runtime": 2149.3054, "eval_samples_per_second": 3.528, "eval_steps_per_second": 1.764, "step": 15500 }, { "epoch": 1.8185015828350335, "grad_norm": 1.1792484521865845, "learning_rate": 3.3866154022366806e-06, "loss": 0.4955, "step": 15510 }, { "epoch": 1.8196740532301559, "grad_norm": 1.6259822845458984, "learning_rate": 3.380783172612928e-06, "loss": 0.5063, "step": 15520 }, { "epoch": 1.8208465236252784, "grad_norm": 1.5383827686309814, "learning_rate": 3.374953403171313e-06, "loss": 0.544, "step": 15530 }, { "epoch": 1.822018994020401, "grad_norm": 1.4813141822814941, "learning_rate": 3.3691261027693872e-06, "loss": 0.5131, "step": 15540 }, { "epoch": 1.8231914644155234, "grad_norm": 1.6500073671340942, "learning_rate": 3.3633012802609476e-06, "loss": 0.4757, "step": 15550 }, { "epoch": 1.8243639348106462, "grad_norm": 1.4000877141952515, "learning_rate": 3.3574789444960304e-06, "loss": 0.4554, "step": 15560 }, { "epoch": 1.8255364052057685, "grad_norm": 1.5220354795455933, "learning_rate": 3.351659104320891e-06, "loss": 0.5061, "step": 15570 }, { "epoch": 1.8267088756008911, "grad_norm": 1.2705525159835815, "learning_rate": 3.345841768577992e-06, "loss": 0.4152, "step": 15580 }, { "epoch": 1.8278813459960137, "grad_norm": 1.375255823135376, "learning_rate": 3.3400269461059955e-06, "loss": 0.4892, "step": 15590 }, { "epoch": 1.829053816391136, "grad_norm": 1.5005638599395752, "learning_rate": 3.3342146457397396e-06, "loss": 0.474, "step": 15600 }, { "epoch": 1.8302262867862586, "grad_norm": 1.5239906311035156, "learning_rate": 3.3284048763102327e-06, "loss": 0.471, "step": 15610 }, { "epoch": 1.8313987571813812, "grad_norm": 1.7793023586273193, "learning_rate": 3.32259764664464e-06, "loss": 0.5694, "step": 15620 }, { "epoch": 1.8325712275765036, "grad_norm": 1.2571715116500854, "learning_rate": 3.3167929655662635e-06, "loss": 0.4381, "step": 15630 }, { "epoch": 1.8337436979716262, "grad_norm": 1.2139416933059692, "learning_rate": 3.310990841894535e-06, "loss": 0.5099, "step": 15640 }, { "epoch": 1.8349161683667488, "grad_norm": 1.7435628175735474, "learning_rate": 3.305191284445004e-06, "loss": 0.4806, "step": 15650 }, { "epoch": 1.8360886387618711, "grad_norm": 1.3920245170593262, "learning_rate": 3.299394302029315e-06, "loss": 0.5256, "step": 15660 }, { "epoch": 1.837261109156994, "grad_norm": 1.3510903120040894, "learning_rate": 3.2935999034552026e-06, "loss": 0.5209, "step": 15670 }, { "epoch": 1.8384335795521163, "grad_norm": 1.3804148435592651, "learning_rate": 3.2878080975264794e-06, "loss": 0.4621, "step": 15680 }, { "epoch": 1.8396060499472389, "grad_norm": 1.303078055381775, "learning_rate": 3.282018893043012e-06, "loss": 0.5158, "step": 15690 }, { "epoch": 1.8407785203423614, "grad_norm": 1.317586064338684, "learning_rate": 3.276232298800717e-06, "loss": 0.4324, "step": 15700 }, { "epoch": 1.8419509907374838, "grad_norm": 1.394755244255066, "learning_rate": 3.270448323591549e-06, "loss": 0.4795, "step": 15710 }, { "epoch": 1.8431234611326064, "grad_norm": 1.3865035772323608, "learning_rate": 3.2646669762034805e-06, "loss": 0.4867, "step": 15720 }, { "epoch": 1.844295931527729, "grad_norm": 1.5968260765075684, "learning_rate": 3.258888265420486e-06, "loss": 0.5158, "step": 15730 }, { "epoch": 1.8454684019228513, "grad_norm": 1.2253371477127075, "learning_rate": 3.253112200022545e-06, "loss": 0.5714, "step": 15740 }, { "epoch": 1.8466408723179741, "grad_norm": 1.4568933248519897, "learning_rate": 3.2473387887856107e-06, "loss": 0.4853, "step": 15750 }, { "epoch": 1.8478133427130965, "grad_norm": 1.6363853216171265, "learning_rate": 3.2415680404815998e-06, "loss": 0.5149, "step": 15760 }, { "epoch": 1.8489858131082189, "grad_norm": 1.2949144840240479, "learning_rate": 3.2357999638783943e-06, "loss": 0.4955, "step": 15770 }, { "epoch": 1.8501582835033417, "grad_norm": 1.3149899244308472, "learning_rate": 3.230034567739808e-06, "loss": 0.5175, "step": 15780 }, { "epoch": 1.851330753898464, "grad_norm": 1.2452431917190552, "learning_rate": 3.2242718608255837e-06, "loss": 0.4716, "step": 15790 }, { "epoch": 1.8525032242935866, "grad_norm": 1.3920133113861084, "learning_rate": 3.2185118518913816e-06, "loss": 0.4634, "step": 15800 }, { "epoch": 1.8536756946887092, "grad_norm": 1.3417644500732422, "learning_rate": 3.2127545496887587e-06, "loss": 0.5088, "step": 15810 }, { "epoch": 1.8548481650838315, "grad_norm": 3.3994052410125732, "learning_rate": 3.2069999629651606e-06, "loss": 0.4628, "step": 15820 }, { "epoch": 1.8560206354789541, "grad_norm": 1.3495444059371948, "learning_rate": 3.20124810046391e-06, "loss": 0.4968, "step": 15830 }, { "epoch": 1.8571931058740767, "grad_norm": 1.3561158180236816, "learning_rate": 3.1954989709241867e-06, "loss": 0.4782, "step": 15840 }, { "epoch": 1.858365576269199, "grad_norm": 1.2995635271072388, "learning_rate": 3.1897525830810195e-06, "loss": 0.4751, "step": 15850 }, { "epoch": 1.8595380466643219, "grad_norm": 1.6011836528778076, "learning_rate": 3.1840089456652725e-06, "loss": 0.4805, "step": 15860 }, { "epoch": 1.8607105170594442, "grad_norm": 1.3277904987335205, "learning_rate": 3.1782680674036294e-06, "loss": 0.5149, "step": 15870 }, { "epoch": 1.8618829874545668, "grad_norm": 1.3419189453125, "learning_rate": 3.172529957018582e-06, "loss": 0.5112, "step": 15880 }, { "epoch": 1.8630554578496894, "grad_norm": 1.432569146156311, "learning_rate": 3.1667946232284185e-06, "loss": 0.4862, "step": 15890 }, { "epoch": 1.8642279282448118, "grad_norm": 1.0708355903625488, "learning_rate": 3.161062074747206e-06, "loss": 0.4874, "step": 15900 }, { "epoch": 1.8654003986399343, "grad_norm": 1.5412769317626953, "learning_rate": 3.1553323202847797e-06, "loss": 0.4602, "step": 15910 }, { "epoch": 1.866572869035057, "grad_norm": 1.4399983882904053, "learning_rate": 3.149605368546733e-06, "loss": 0.436, "step": 15920 }, { "epoch": 1.8677453394301793, "grad_norm": 1.218432903289795, "learning_rate": 3.143881228234398e-06, "loss": 0.5176, "step": 15930 }, { "epoch": 1.8689178098253019, "grad_norm": 1.5745799541473389, "learning_rate": 3.138159908044834e-06, "loss": 0.4827, "step": 15940 }, { "epoch": 1.8700902802204245, "grad_norm": 1.3244707584381104, "learning_rate": 3.1324414166708193e-06, "loss": 0.491, "step": 15950 }, { "epoch": 1.8712627506155468, "grad_norm": 1.7130377292633057, "learning_rate": 3.1267257628008325e-06, "loss": 0.523, "step": 15960 }, { "epoch": 1.8724352210106696, "grad_norm": 1.5086690187454224, "learning_rate": 3.121012955119038e-06, "loss": 0.5199, "step": 15970 }, { "epoch": 1.873607691405792, "grad_norm": 1.1464070081710815, "learning_rate": 3.1153030023052823e-06, "loss": 0.4791, "step": 15980 }, { "epoch": 1.8747801618009146, "grad_norm": 2.1236631870269775, "learning_rate": 3.109595913035068e-06, "loss": 0.4773, "step": 15990 }, { "epoch": 1.8759526321960371, "grad_norm": 1.3252651691436768, "learning_rate": 3.1038916959795484e-06, "loss": 0.5225, "step": 16000 }, { "epoch": 1.8759526321960371, "eval_loss": 0.6860724091529846, "eval_model_preparation_time": 0.0, "eval_runtime": 2155.0856, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.759, "step": 16000 }, { "epoch": 1.8771251025911595, "grad_norm": 1.3589649200439453, "learning_rate": 3.098190359805519e-06, "loss": 0.4132, "step": 16010 }, { "epoch": 1.878297572986282, "grad_norm": 1.1424484252929688, "learning_rate": 3.092491913175388e-06, "loss": 0.5216, "step": 16020 }, { "epoch": 1.8794700433814047, "grad_norm": 1.5989916324615479, "learning_rate": 3.0867963647471794e-06, "loss": 0.5016, "step": 16030 }, { "epoch": 1.880642513776527, "grad_norm": 1.7132763862609863, "learning_rate": 3.0811037231745156e-06, "loss": 0.5127, "step": 16040 }, { "epoch": 1.8818149841716498, "grad_norm": 1.4114233255386353, "learning_rate": 3.075413997106598e-06, "loss": 0.4412, "step": 16050 }, { "epoch": 1.8829874545667722, "grad_norm": 1.6855621337890625, "learning_rate": 3.0697271951881964e-06, "loss": 0.4941, "step": 16060 }, { "epoch": 1.8841599249618945, "grad_norm": 1.5143619775772095, "learning_rate": 3.064043326059648e-06, "loss": 0.501, "step": 16070 }, { "epoch": 1.8853323953570174, "grad_norm": 1.1024597883224487, "learning_rate": 3.0583623983568224e-06, "loss": 0.4548, "step": 16080 }, { "epoch": 1.8865048657521397, "grad_norm": 1.4546362161636353, "learning_rate": 3.052684420711125e-06, "loss": 0.4864, "step": 16090 }, { "epoch": 1.8876773361472623, "grad_norm": 1.3422895669937134, "learning_rate": 3.047009401749482e-06, "loss": 0.4209, "step": 16100 }, { "epoch": 1.8888498065423849, "grad_norm": 1.396811842918396, "learning_rate": 3.041337350094321e-06, "loss": 0.4941, "step": 16110 }, { "epoch": 1.8900222769375072, "grad_norm": 1.5278092622756958, "learning_rate": 3.035668274363557e-06, "loss": 0.4414, "step": 16120 }, { "epoch": 1.8911947473326298, "grad_norm": 1.4436333179473877, "learning_rate": 3.0300021831705926e-06, "loss": 0.4688, "step": 16130 }, { "epoch": 1.8923672177277524, "grad_norm": 1.5010651350021362, "learning_rate": 3.024339085124291e-06, "loss": 0.4701, "step": 16140 }, { "epoch": 1.8935396881228748, "grad_norm": 1.2057087421417236, "learning_rate": 3.018678988828963e-06, "loss": 0.481, "step": 16150 }, { "epoch": 1.8947121585179976, "grad_norm": 1.289288878440857, "learning_rate": 3.0130219028843687e-06, "loss": 0.4994, "step": 16160 }, { "epoch": 1.89588462891312, "grad_norm": 1.3407689332962036, "learning_rate": 3.0073678358856882e-06, "loss": 0.5089, "step": 16170 }, { "epoch": 1.8970570993082425, "grad_norm": 1.392730474472046, "learning_rate": 3.001716796423515e-06, "loss": 0.4486, "step": 16180 }, { "epoch": 1.898229569703365, "grad_norm": 1.870256781578064, "learning_rate": 2.9960687930838408e-06, "loss": 0.4954, "step": 16190 }, { "epoch": 1.8994020400984875, "grad_norm": 1.3336708545684814, "learning_rate": 2.99042383444805e-06, "loss": 0.4769, "step": 16200 }, { "epoch": 1.90057451049361, "grad_norm": 1.5791990756988525, "learning_rate": 2.9847819290928994e-06, "loss": 0.4945, "step": 16210 }, { "epoch": 1.9017469808887326, "grad_norm": 1.1143450736999512, "learning_rate": 2.9791430855904983e-06, "loss": 0.4166, "step": 16220 }, { "epoch": 1.902919451283855, "grad_norm": 1.432306170463562, "learning_rate": 2.973507312508318e-06, "loss": 0.4762, "step": 16230 }, { "epoch": 1.9040919216789776, "grad_norm": 1.2897073030471802, "learning_rate": 2.967874618409155e-06, "loss": 0.4646, "step": 16240 }, { "epoch": 1.9052643920741001, "grad_norm": 1.7369928359985352, "learning_rate": 2.9622450118511293e-06, "loss": 0.5208, "step": 16250 }, { "epoch": 1.9064368624692225, "grad_norm": 1.3651738166809082, "learning_rate": 2.956618501387671e-06, "loss": 0.5272, "step": 16260 }, { "epoch": 1.9076093328643453, "grad_norm": 1.3422071933746338, "learning_rate": 2.950995095567508e-06, "loss": 0.4735, "step": 16270 }, { "epoch": 1.9087818032594677, "grad_norm": 1.3274378776550293, "learning_rate": 2.945374802934645e-06, "loss": 0.5106, "step": 16280 }, { "epoch": 1.9099542736545903, "grad_norm": 1.2794208526611328, "learning_rate": 2.939757632028365e-06, "loss": 0.5035, "step": 16290 }, { "epoch": 1.9111267440497128, "grad_norm": 1.5482145547866821, "learning_rate": 2.9341435913832005e-06, "loss": 0.5103, "step": 16300 }, { "epoch": 1.9122992144448352, "grad_norm": 1.0174283981323242, "learning_rate": 2.9285326895289305e-06, "loss": 0.4558, "step": 16310 }, { "epoch": 1.9134716848399578, "grad_norm": 1.6720904111862183, "learning_rate": 2.9229249349905686e-06, "loss": 0.4432, "step": 16320 }, { "epoch": 1.9146441552350804, "grad_norm": 1.317158818244934, "learning_rate": 2.9173203362883424e-06, "loss": 0.4489, "step": 16330 }, { "epoch": 1.9158166256302027, "grad_norm": 1.416707158088684, "learning_rate": 2.911718901937682e-06, "loss": 0.5142, "step": 16340 }, { "epoch": 1.9169890960253255, "grad_norm": 1.3653700351715088, "learning_rate": 2.9061206404492183e-06, "loss": 0.5132, "step": 16350 }, { "epoch": 1.9181615664204479, "grad_norm": 1.3466061353683472, "learning_rate": 2.9005255603287542e-06, "loss": 0.4824, "step": 16360 }, { "epoch": 1.9193340368155702, "grad_norm": 1.270234227180481, "learning_rate": 2.894933670077259e-06, "loss": 0.4809, "step": 16370 }, { "epoch": 1.920506507210693, "grad_norm": 1.4555280208587646, "learning_rate": 2.8893449781908635e-06, "loss": 0.4836, "step": 16380 }, { "epoch": 1.9216789776058154, "grad_norm": 1.1721596717834473, "learning_rate": 2.8837594931608305e-06, "loss": 0.4345, "step": 16390 }, { "epoch": 1.922851448000938, "grad_norm": 1.2154380083084106, "learning_rate": 2.8781772234735493e-06, "loss": 0.4945, "step": 16400 }, { "epoch": 1.9240239183960606, "grad_norm": 1.6090389490127563, "learning_rate": 2.8725981776105328e-06, "loss": 0.4748, "step": 16410 }, { "epoch": 1.925196388791183, "grad_norm": 1.387230634689331, "learning_rate": 2.8670223640483885e-06, "loss": 0.4501, "step": 16420 }, { "epoch": 1.9263688591863055, "grad_norm": 1.4377638101577759, "learning_rate": 2.8614497912588136e-06, "loss": 0.4862, "step": 16430 }, { "epoch": 1.927541329581428, "grad_norm": 1.3871772289276123, "learning_rate": 2.855880467708586e-06, "loss": 0.4889, "step": 16440 }, { "epoch": 1.9287137999765505, "grad_norm": 1.6850452423095703, "learning_rate": 2.850314401859542e-06, "loss": 0.4749, "step": 16450 }, { "epoch": 1.9298862703716733, "grad_norm": 1.3124243021011353, "learning_rate": 2.8447516021685696e-06, "loss": 0.5354, "step": 16460 }, { "epoch": 1.9310587407667956, "grad_norm": 1.4865137338638306, "learning_rate": 2.8391920770875957e-06, "loss": 0.5146, "step": 16470 }, { "epoch": 1.9322312111619182, "grad_norm": 1.3306686878204346, "learning_rate": 2.833635835063569e-06, "loss": 0.4743, "step": 16480 }, { "epoch": 1.9334036815570408, "grad_norm": 1.5863094329833984, "learning_rate": 2.828082884538451e-06, "loss": 0.5065, "step": 16490 }, { "epoch": 1.9345761519521631, "grad_norm": 1.4328006505966187, "learning_rate": 2.8225332339492065e-06, "loss": 0.5717, "step": 16500 }, { "epoch": 1.9345761519521631, "eval_loss": 0.6823179125785828, "eval_model_preparation_time": 0.0, "eval_runtime": 2149.1994, "eval_samples_per_second": 3.528, "eval_steps_per_second": 1.764, "step": 16500 }, { "epoch": 1.9357486223472857, "grad_norm": 1.2676780223846436, "learning_rate": 2.8169868917277816e-06, "loss": 0.4777, "step": 16510 }, { "epoch": 1.9369210927424083, "grad_norm": 1.394472360610962, "learning_rate": 2.811443866301097e-06, "loss": 0.4508, "step": 16520 }, { "epoch": 1.9380935631375307, "grad_norm": 1.519976019859314, "learning_rate": 2.8059041660910325e-06, "loss": 0.5056, "step": 16530 }, { "epoch": 1.9392660335326533, "grad_norm": 1.2431764602661133, "learning_rate": 2.8003677995144197e-06, "loss": 0.4706, "step": 16540 }, { "epoch": 1.9404385039277758, "grad_norm": 1.2638840675354004, "learning_rate": 2.7948347749830195e-06, "loss": 0.4799, "step": 16550 }, { "epoch": 1.9416109743228982, "grad_norm": 1.3755587339401245, "learning_rate": 2.789305100903522e-06, "loss": 0.518, "step": 16560 }, { "epoch": 1.942783444718021, "grad_norm": 1.4328900575637817, "learning_rate": 2.783778785677522e-06, "loss": 0.4581, "step": 16570 }, { "epoch": 1.9439559151131434, "grad_norm": 1.4385415315628052, "learning_rate": 2.778255837701512e-06, "loss": 0.5063, "step": 16580 }, { "epoch": 1.945128385508266, "grad_norm": 1.2607309818267822, "learning_rate": 2.772736265366869e-06, "loss": 0.4975, "step": 16590 }, { "epoch": 1.9463008559033885, "grad_norm": 1.2885854244232178, "learning_rate": 2.7672200770598385e-06, "loss": 0.5134, "step": 16600 }, { "epoch": 1.9474733262985109, "grad_norm": 1.1373357772827148, "learning_rate": 2.7617072811615264e-06, "loss": 0.5154, "step": 16610 }, { "epoch": 1.9486457966936335, "grad_norm": 1.6354396343231201, "learning_rate": 2.756197886047887e-06, "loss": 0.4856, "step": 16620 }, { "epoch": 1.949818267088756, "grad_norm": 1.7963088750839233, "learning_rate": 2.7506919000897046e-06, "loss": 0.4673, "step": 16630 }, { "epoch": 1.9509907374838784, "grad_norm": 1.624948263168335, "learning_rate": 2.7451893316525833e-06, "loss": 0.4895, "step": 16640 }, { "epoch": 1.9521632078790012, "grad_norm": 1.5189015865325928, "learning_rate": 2.739690189096936e-06, "loss": 0.4764, "step": 16650 }, { "epoch": 1.9533356782741236, "grad_norm": 1.4028252363204956, "learning_rate": 2.734194480777969e-06, "loss": 0.4333, "step": 16660 }, { "epoch": 1.954508148669246, "grad_norm": 1.5875906944274902, "learning_rate": 2.728702215045671e-06, "loss": 0.4739, "step": 16670 }, { "epoch": 1.9556806190643687, "grad_norm": 1.3203349113464355, "learning_rate": 2.723213400244804e-06, "loss": 0.4758, "step": 16680 }, { "epoch": 1.956853089459491, "grad_norm": 1.3141456842422485, "learning_rate": 2.7177280447148826e-06, "loss": 0.4917, "step": 16690 }, { "epoch": 1.9580255598546137, "grad_norm": 1.8169490098953247, "learning_rate": 2.7122461567901638e-06, "loss": 0.4943, "step": 16700 }, { "epoch": 1.9591980302497363, "grad_norm": 1.1625810861587524, "learning_rate": 2.706767744799645e-06, "loss": 0.4649, "step": 16710 }, { "epoch": 1.9603705006448586, "grad_norm": 1.3436334133148193, "learning_rate": 2.7012928170670303e-06, "loss": 0.4772, "step": 16720 }, { "epoch": 1.9615429710399812, "grad_norm": 1.3879536390304565, "learning_rate": 2.695821381910736e-06, "loss": 0.4851, "step": 16730 }, { "epoch": 1.9627154414351038, "grad_norm": 1.3528167009353638, "learning_rate": 2.6903534476438754e-06, "loss": 0.4809, "step": 16740 }, { "epoch": 1.9638879118302262, "grad_norm": 1.210437536239624, "learning_rate": 2.6848890225742376e-06, "loss": 0.494, "step": 16750 }, { "epoch": 1.965060382225349, "grad_norm": 1.623511791229248, "learning_rate": 2.679428115004279e-06, "loss": 0.4955, "step": 16760 }, { "epoch": 1.9662328526204713, "grad_norm": 1.2305043935775757, "learning_rate": 2.6739707332311178e-06, "loss": 0.4986, "step": 16770 }, { "epoch": 1.967405323015594, "grad_norm": 1.3504571914672852, "learning_rate": 2.6685168855465117e-06, "loss": 0.4337, "step": 16780 }, { "epoch": 1.9685777934107165, "grad_norm": 1.8431240320205688, "learning_rate": 2.6630665802368426e-06, "loss": 0.4961, "step": 16790 }, { "epoch": 1.9697502638058388, "grad_norm": 1.2100582122802734, "learning_rate": 2.657619825583122e-06, "loss": 0.4977, "step": 16800 }, { "epoch": 1.9709227342009614, "grad_norm": 1.227144479751587, "learning_rate": 2.6521766298609586e-06, "loss": 0.4468, "step": 16810 }, { "epoch": 1.972095204596084, "grad_norm": 1.225894570350647, "learning_rate": 2.646737001340557e-06, "loss": 0.4963, "step": 16820 }, { "epoch": 1.9732676749912064, "grad_norm": 1.2316418886184692, "learning_rate": 2.641300948286698e-06, "loss": 0.4726, "step": 16830 }, { "epoch": 1.974440145386329, "grad_norm": 1.567684531211853, "learning_rate": 2.6358684789587374e-06, "loss": 0.4559, "step": 16840 }, { "epoch": 1.9756126157814515, "grad_norm": 1.1368367671966553, "learning_rate": 2.630439601610581e-06, "loss": 0.4842, "step": 16850 }, { "epoch": 1.976785086176574, "grad_norm": 1.3253400325775146, "learning_rate": 2.6250143244906724e-06, "loss": 0.4812, "step": 16860 }, { "epoch": 1.9779575565716967, "grad_norm": 1.478934645652771, "learning_rate": 2.619592655841996e-06, "loss": 0.4641, "step": 16870 }, { "epoch": 1.979130026966819, "grad_norm": 1.2328530550003052, "learning_rate": 2.6141746039020454e-06, "loss": 0.4795, "step": 16880 }, { "epoch": 1.9803024973619416, "grad_norm": 1.7381224632263184, "learning_rate": 2.6087601769028205e-06, "loss": 0.5119, "step": 16890 }, { "epoch": 1.9814749677570642, "grad_norm": 1.6499733924865723, "learning_rate": 2.6033493830708177e-06, "loss": 0.5105, "step": 16900 }, { "epoch": 1.9826474381521866, "grad_norm": 1.421966791152954, "learning_rate": 2.597942230627009e-06, "loss": 0.5092, "step": 16910 }, { "epoch": 1.9838199085473092, "grad_norm": 1.2215737104415894, "learning_rate": 2.5925387277868353e-06, "loss": 0.4561, "step": 16920 }, { "epoch": 1.9849923789424317, "grad_norm": 1.5399583578109741, "learning_rate": 2.5871388827601917e-06, "loss": 0.4866, "step": 16930 }, { "epoch": 1.986164849337554, "grad_norm": 1.1787561178207397, "learning_rate": 2.581742703751417e-06, "loss": 0.4603, "step": 16940 }, { "epoch": 1.987337319732677, "grad_norm": 1.2946648597717285, "learning_rate": 2.5763501989592767e-06, "loss": 0.4468, "step": 16950 }, { "epoch": 1.9885097901277993, "grad_norm": 1.7168962955474854, "learning_rate": 2.5709613765769604e-06, "loss": 0.4703, "step": 16960 }, { "epoch": 1.9896822605229216, "grad_norm": 1.2824171781539917, "learning_rate": 2.565576244792057e-06, "loss": 0.4631, "step": 16970 }, { "epoch": 1.9908547309180444, "grad_norm": 1.4698327779769897, "learning_rate": 2.5601948117865493e-06, "loss": 0.4969, "step": 16980 }, { "epoch": 1.9920272013131668, "grad_norm": 1.3583250045776367, "learning_rate": 2.5548170857368015e-06, "loss": 0.4667, "step": 16990 }, { "epoch": 1.9931996717082894, "grad_norm": 1.431276798248291, "learning_rate": 2.5494430748135445e-06, "loss": 0.4773, "step": 17000 }, { "epoch": 1.9931996717082894, "eval_loss": 0.6805582046508789, "eval_model_preparation_time": 0.0, "eval_runtime": 2148.7867, "eval_samples_per_second": 3.529, "eval_steps_per_second": 1.764, "step": 17000 }, { "epoch": 1.994372142103412, "grad_norm": 1.2633074522018433, "learning_rate": 2.5440727871818626e-06, "loss": 0.5075, "step": 17010 }, { "epoch": 1.9955446124985343, "grad_norm": 1.4585095643997192, "learning_rate": 2.5387062310011903e-06, "loss": 0.5021, "step": 17020 }, { "epoch": 1.996717082893657, "grad_norm": 1.4468519687652588, "learning_rate": 2.5333434144252843e-06, "loss": 0.4447, "step": 17030 }, { "epoch": 1.9978895532887795, "grad_norm": 1.6242985725402832, "learning_rate": 2.527984345602225e-06, "loss": 0.4679, "step": 17040 }, { "epoch": 1.9990620236839018, "grad_norm": 1.36859929561615, "learning_rate": 2.522629032674395e-06, "loss": 0.4789, "step": 17050 }, { "epoch": 2.0002344940790246, "grad_norm": 1.579290509223938, "learning_rate": 2.5172774837784717e-06, "loss": 0.4856, "step": 17060 }, { "epoch": 2.001406964474147, "grad_norm": 1.2946696281433105, "learning_rate": 2.5119297070454127e-06, "loss": 0.3519, "step": 17070 }, { "epoch": 2.0025794348692694, "grad_norm": 1.8592146635055542, "learning_rate": 2.506585710600449e-06, "loss": 0.3344, "step": 17080 }, { "epoch": 2.003751905264392, "grad_norm": 1.5873470306396484, "learning_rate": 2.501245502563062e-06, "loss": 0.3172, "step": 17090 }, { "epoch": 2.0049243756595145, "grad_norm": 1.3384560346603394, "learning_rate": 2.4959090910469796e-06, "loss": 0.3235, "step": 17100 }, { "epoch": 2.0060968460546373, "grad_norm": 1.5928928852081299, "learning_rate": 2.490576484160161e-06, "loss": 0.3508, "step": 17110 }, { "epoch": 2.0072693164497597, "grad_norm": 1.3109548091888428, "learning_rate": 2.4852476900047852e-06, "loss": 0.3422, "step": 17120 }, { "epoch": 2.008441786844882, "grad_norm": 1.3600844144821167, "learning_rate": 2.479922716677236e-06, "loss": 0.3427, "step": 17130 }, { "epoch": 2.009614257240005, "grad_norm": 1.5016018152236938, "learning_rate": 2.4746015722680975e-06, "loss": 0.3308, "step": 17140 }, { "epoch": 2.0107867276351272, "grad_norm": 1.934796690940857, "learning_rate": 2.4692842648621316e-06, "loss": 0.3379, "step": 17150 }, { "epoch": 2.0119591980302496, "grad_norm": 1.4923080205917358, "learning_rate": 2.4639708025382697e-06, "loss": 0.3462, "step": 17160 }, { "epoch": 2.0131316684253724, "grad_norm": 1.3157778978347778, "learning_rate": 2.4586611933696075e-06, "loss": 0.3092, "step": 17170 }, { "epoch": 2.0143041388204947, "grad_norm": 1.7378487586975098, "learning_rate": 2.4533554454233777e-06, "loss": 0.3214, "step": 17180 }, { "epoch": 2.015476609215617, "grad_norm": 1.602013349533081, "learning_rate": 2.4480535667609496e-06, "loss": 0.2881, "step": 17190 }, { "epoch": 2.01664907961074, "grad_norm": 1.4190000295639038, "learning_rate": 2.4427555654378184e-06, "loss": 0.3152, "step": 17200 }, { "epoch": 2.0178215500058623, "grad_norm": 1.5184204578399658, "learning_rate": 2.4374614495035825e-06, "loss": 0.3231, "step": 17210 }, { "epoch": 2.018994020400985, "grad_norm": 1.144080638885498, "learning_rate": 2.4321712270019373e-06, "loss": 0.3008, "step": 17220 }, { "epoch": 2.0201664907961074, "grad_norm": 1.5121253728866577, "learning_rate": 2.4268849059706673e-06, "loss": 0.3057, "step": 17230 }, { "epoch": 2.02133896119123, "grad_norm": 1.3726460933685303, "learning_rate": 2.421602494441627e-06, "loss": 0.3112, "step": 17240 }, { "epoch": 2.0225114315863526, "grad_norm": 1.9195501804351807, "learning_rate": 2.416324000440724e-06, "loss": 0.3388, "step": 17250 }, { "epoch": 2.023683901981475, "grad_norm": 1.8971065282821655, "learning_rate": 2.4110494319879256e-06, "loss": 0.3344, "step": 17260 }, { "epoch": 2.0248563723765973, "grad_norm": 1.3542076349258423, "learning_rate": 2.4057787970972266e-06, "loss": 0.2942, "step": 17270 }, { "epoch": 2.02602884277172, "grad_norm": 1.8399713039398193, "learning_rate": 2.4005121037766467e-06, "loss": 0.3024, "step": 17280 }, { "epoch": 2.0272013131668425, "grad_norm": 1.395883321762085, "learning_rate": 2.3952493600282213e-06, "loss": 0.2846, "step": 17290 }, { "epoch": 2.028373783561965, "grad_norm": 1.4793384075164795, "learning_rate": 2.389990573847979e-06, "loss": 0.3059, "step": 17300 }, { "epoch": 2.0295462539570877, "grad_norm": 1.3387218713760376, "learning_rate": 2.3847357532259386e-06, "loss": 0.3287, "step": 17310 }, { "epoch": 2.03071872435221, "grad_norm": 1.6271189451217651, "learning_rate": 2.3794849061460928e-06, "loss": 0.3331, "step": 17320 }, { "epoch": 2.031891194747333, "grad_norm": 1.2827647924423218, "learning_rate": 2.3742380405863975e-06, "loss": 0.3225, "step": 17330 }, { "epoch": 2.033063665142455, "grad_norm": 1.7314237356185913, "learning_rate": 2.3689951645187576e-06, "loss": 0.3288, "step": 17340 }, { "epoch": 2.0342361355375775, "grad_norm": 1.3252629041671753, "learning_rate": 2.3637562859090208e-06, "loss": 0.3265, "step": 17350 }, { "epoch": 2.0354086059327003, "grad_norm": 1.6964740753173828, "learning_rate": 2.358521412716957e-06, "loss": 0.3256, "step": 17360 }, { "epoch": 2.0365810763278227, "grad_norm": 1.5427204370498657, "learning_rate": 2.353290552896252e-06, "loss": 0.2925, "step": 17370 }, { "epoch": 2.037753546722945, "grad_norm": 1.5068695545196533, "learning_rate": 2.3480637143944936e-06, "loss": 0.3455, "step": 17380 }, { "epoch": 2.038926017118068, "grad_norm": 1.3651807308197021, "learning_rate": 2.342840905153159e-06, "loss": 0.3392, "step": 17390 }, { "epoch": 2.0400984875131902, "grad_norm": 1.7463256120681763, "learning_rate": 2.3376221331076033e-06, "loss": 0.3675, "step": 17400 }, { "epoch": 2.041270957908313, "grad_norm": 1.8717834949493408, "learning_rate": 2.332407406187052e-06, "loss": 0.3323, "step": 17410 }, { "epoch": 2.0424434283034354, "grad_norm": 1.6519601345062256, "learning_rate": 2.3271967323145782e-06, "loss": 0.3319, "step": 17420 }, { "epoch": 2.0436158986985578, "grad_norm": 1.5076632499694824, "learning_rate": 2.3219901194071016e-06, "loss": 0.3012, "step": 17430 }, { "epoch": 2.0447883690936806, "grad_norm": 1.2975502014160156, "learning_rate": 2.316787575375369e-06, "loss": 0.3599, "step": 17440 }, { "epoch": 2.045960839488803, "grad_norm": 1.24033522605896, "learning_rate": 2.3115891081239462e-06, "loss": 0.3454, "step": 17450 }, { "epoch": 2.0471333098839253, "grad_norm": 1.3575423955917358, "learning_rate": 2.306394725551205e-06, "loss": 0.3424, "step": 17460 }, { "epoch": 2.048305780279048, "grad_norm": 1.3342974185943604, "learning_rate": 2.301204435549309e-06, "loss": 0.3045, "step": 17470 }, { "epoch": 2.0494782506741704, "grad_norm": 1.5656081438064575, "learning_rate": 2.29601824600421e-06, "loss": 0.3053, "step": 17480 }, { "epoch": 2.050650721069293, "grad_norm": 1.4154105186462402, "learning_rate": 2.2908361647956228e-06, "loss": 0.3046, "step": 17490 }, { "epoch": 2.0518231914644156, "grad_norm": 1.7379752397537231, "learning_rate": 2.2856581997970225e-06, "loss": 0.3479, "step": 17500 }, { "epoch": 2.0518231914644156, "eval_loss": 0.7379317283630371, "eval_model_preparation_time": 0.0, "eval_runtime": 2151.3038, "eval_samples_per_second": 3.524, "eval_steps_per_second": 1.762, "step": 17500 }, { "epoch": 2.052995661859538, "grad_norm": 1.6222236156463623, "learning_rate": 2.2804843588756307e-06, "loss": 0.3126, "step": 17510 }, { "epoch": 2.0541681322546608, "grad_norm": 1.366179347038269, "learning_rate": 2.2753146498924027e-06, "loss": 0.3017, "step": 17520 }, { "epoch": 2.055340602649783, "grad_norm": 1.700607419013977, "learning_rate": 2.270149080702014e-06, "loss": 0.3304, "step": 17530 }, { "epoch": 2.0565130730449055, "grad_norm": 1.7270163297653198, "learning_rate": 2.2649876591528553e-06, "loss": 0.3092, "step": 17540 }, { "epoch": 2.0576855434400283, "grad_norm": 1.260096788406372, "learning_rate": 2.25983039308701e-06, "loss": 0.2975, "step": 17550 }, { "epoch": 2.0588580138351507, "grad_norm": 1.542763113975525, "learning_rate": 2.254677290340252e-06, "loss": 0.3327, "step": 17560 }, { "epoch": 2.060030484230273, "grad_norm": 1.741746187210083, "learning_rate": 2.2495283587420262e-06, "loss": 0.3502, "step": 17570 }, { "epoch": 2.061202954625396, "grad_norm": 1.421644926071167, "learning_rate": 2.244383606115442e-06, "loss": 0.3238, "step": 17580 }, { "epoch": 2.062375425020518, "grad_norm": 1.6083431243896484, "learning_rate": 2.239243040277256e-06, "loss": 0.3319, "step": 17590 }, { "epoch": 2.0635478954156405, "grad_norm": 1.807454228401184, "learning_rate": 2.2341066690378716e-06, "loss": 0.36, "step": 17600 }, { "epoch": 2.0647203658107633, "grad_norm": 1.750653862953186, "learning_rate": 2.2289745002013114e-06, "loss": 0.3164, "step": 17610 }, { "epoch": 2.0658928362058857, "grad_norm": 1.3270869255065918, "learning_rate": 2.223846541565213e-06, "loss": 0.3155, "step": 17620 }, { "epoch": 2.0670653066010085, "grad_norm": 1.59390127658844, "learning_rate": 2.2187228009208266e-06, "loss": 0.34, "step": 17630 }, { "epoch": 2.068237776996131, "grad_norm": 1.5706123113632202, "learning_rate": 2.2136032860529804e-06, "loss": 0.3453, "step": 17640 }, { "epoch": 2.0694102473912532, "grad_norm": 1.769443392753601, "learning_rate": 2.2084880047400893e-06, "loss": 0.3799, "step": 17650 }, { "epoch": 2.070582717786376, "grad_norm": 1.5415226221084595, "learning_rate": 2.2033769647541374e-06, "loss": 0.3279, "step": 17660 }, { "epoch": 2.0717551881814984, "grad_norm": 1.8022586107254028, "learning_rate": 2.198270173860662e-06, "loss": 0.3006, "step": 17670 }, { "epoch": 2.0729276585766208, "grad_norm": 1.4137473106384277, "learning_rate": 2.1931676398187417e-06, "loss": 0.3066, "step": 17680 }, { "epoch": 2.0741001289717436, "grad_norm": 1.552742838859558, "learning_rate": 2.1880693703809958e-06, "loss": 0.3541, "step": 17690 }, { "epoch": 2.075272599366866, "grad_norm": 1.3886044025421143, "learning_rate": 2.182975373293557e-06, "loss": 0.3227, "step": 17700 }, { "epoch": 2.0764450697619887, "grad_norm": 1.2142077684402466, "learning_rate": 2.1778856562960647e-06, "loss": 0.2611, "step": 17710 }, { "epoch": 2.077617540157111, "grad_norm": 1.948805570602417, "learning_rate": 2.1728002271216634e-06, "loss": 0.332, "step": 17720 }, { "epoch": 2.0787900105522334, "grad_norm": 1.4859305620193481, "learning_rate": 2.1677190934969777e-06, "loss": 0.32, "step": 17730 }, { "epoch": 2.0799624809473563, "grad_norm": 1.604649543762207, "learning_rate": 2.1626422631421045e-06, "loss": 0.3607, "step": 17740 }, { "epoch": 2.0811349513424786, "grad_norm": 1.4743728637695312, "learning_rate": 2.1575697437706076e-06, "loss": 0.3285, "step": 17750 }, { "epoch": 2.082307421737601, "grad_norm": 1.4923919439315796, "learning_rate": 2.1525015430894974e-06, "loss": 0.2993, "step": 17760 }, { "epoch": 2.0834798921327238, "grad_norm": 1.4569511413574219, "learning_rate": 2.147437668799221e-06, "loss": 0.3274, "step": 17770 }, { "epoch": 2.084652362527846, "grad_norm": 1.5510083436965942, "learning_rate": 2.1423781285936556e-06, "loss": 0.3009, "step": 17780 }, { "epoch": 2.0858248329229685, "grad_norm": 1.6588085889816284, "learning_rate": 2.1373229301600916e-06, "loss": 0.3221, "step": 17790 }, { "epoch": 2.0869973033180913, "grad_norm": 1.5837905406951904, "learning_rate": 2.1322720811792213e-06, "loss": 0.3215, "step": 17800 }, { "epoch": 2.0881697737132137, "grad_norm": 1.5331858396530151, "learning_rate": 2.1272255893251333e-06, "loss": 0.2976, "step": 17810 }, { "epoch": 2.0893422441083365, "grad_norm": 1.3247042894363403, "learning_rate": 2.1221834622652913e-06, "loss": 0.3279, "step": 17820 }, { "epoch": 2.090514714503459, "grad_norm": 1.4301382303237915, "learning_rate": 2.11714570766053e-06, "loss": 0.3025, "step": 17830 }, { "epoch": 2.091687184898581, "grad_norm": 2.057274580001831, "learning_rate": 2.112112333165039e-06, "loss": 0.3435, "step": 17840 }, { "epoch": 2.092859655293704, "grad_norm": 1.0729871988296509, "learning_rate": 2.1070833464263537e-06, "loss": 0.3263, "step": 17850 }, { "epoch": 2.0940321256888264, "grad_norm": 1.4444668292999268, "learning_rate": 2.1020587550853404e-06, "loss": 0.3326, "step": 17860 }, { "epoch": 2.0952045960839487, "grad_norm": 1.6532708406448364, "learning_rate": 2.097038566776195e-06, "loss": 0.3372, "step": 17870 }, { "epoch": 2.0963770664790715, "grad_norm": 1.2804591655731201, "learning_rate": 2.0920227891264155e-06, "loss": 0.3259, "step": 17880 }, { "epoch": 2.097549536874194, "grad_norm": 1.5900686979293823, "learning_rate": 2.087011429756802e-06, "loss": 0.3108, "step": 17890 }, { "epoch": 2.0987220072693162, "grad_norm": 1.3856329917907715, "learning_rate": 2.0820044962814393e-06, "loss": 0.2975, "step": 17900 }, { "epoch": 2.099894477664439, "grad_norm": 2.0191125869750977, "learning_rate": 2.0770019963076905e-06, "loss": 0.31, "step": 17910 }, { "epoch": 2.1010669480595614, "grad_norm": 2.0470566749572754, "learning_rate": 2.0720039374361796e-06, "loss": 0.3498, "step": 17920 }, { "epoch": 2.102239418454684, "grad_norm": 1.677100419998169, "learning_rate": 2.067010327260787e-06, "loss": 0.3271, "step": 17930 }, { "epoch": 2.1034118888498066, "grad_norm": 1.7902899980545044, "learning_rate": 2.0620211733686317e-06, "loss": 0.3525, "step": 17940 }, { "epoch": 2.104584359244929, "grad_norm": 1.8712518215179443, "learning_rate": 2.0570364833400603e-06, "loss": 0.2868, "step": 17950 }, { "epoch": 2.1057568296400517, "grad_norm": 1.8616176843643188, "learning_rate": 2.05205626474864e-06, "loss": 0.3, "step": 17960 }, { "epoch": 2.106929300035174, "grad_norm": 1.3892697095870972, "learning_rate": 2.0470805251611415e-06, "loss": 0.3652, "step": 17970 }, { "epoch": 2.1081017704302965, "grad_norm": 1.5513066053390503, "learning_rate": 2.0421092721375307e-06, "loss": 0.3197, "step": 17980 }, { "epoch": 2.1092742408254193, "grad_norm": 1.762174367904663, "learning_rate": 2.0371425132309612e-06, "loss": 0.334, "step": 17990 }, { "epoch": 2.1104467112205416, "grad_norm": 1.4834725856781006, "learning_rate": 2.032180255987754e-06, "loss": 0.3022, "step": 18000 }, { "epoch": 2.1104467112205416, "eval_loss": 0.7339808344841003, "eval_model_preparation_time": 0.0, "eval_runtime": 2152.246, "eval_samples_per_second": 3.523, "eval_steps_per_second": 1.761, "step": 18000 }, { "epoch": 2.111619181615664, "grad_norm": 1.4339616298675537, "learning_rate": 2.0272225079473884e-06, "loss": 0.326, "step": 18010 }, { "epoch": 2.112791652010787, "grad_norm": 1.444818139076233, "learning_rate": 2.0222692766425016e-06, "loss": 0.3164, "step": 18020 }, { "epoch": 2.113964122405909, "grad_norm": 1.4939240217208862, "learning_rate": 2.0173205695988566e-06, "loss": 0.2975, "step": 18030 }, { "epoch": 2.115136592801032, "grad_norm": 1.9201432466506958, "learning_rate": 2.0123763943353485e-06, "loss": 0.2733, "step": 18040 }, { "epoch": 2.1163090631961543, "grad_norm": 1.6690709590911865, "learning_rate": 2.0074367583639897e-06, "loss": 0.3565, "step": 18050 }, { "epoch": 2.1174815335912767, "grad_norm": 1.1956110000610352, "learning_rate": 2.0025016691898907e-06, "loss": 0.3438, "step": 18060 }, { "epoch": 2.1186540039863995, "grad_norm": 1.794450283050537, "learning_rate": 1.997571134311254e-06, "loss": 0.3015, "step": 18070 }, { "epoch": 2.119826474381522, "grad_norm": 1.3913649320602417, "learning_rate": 1.9926451612193676e-06, "loss": 0.304, "step": 18080 }, { "epoch": 2.120998944776644, "grad_norm": 1.5987915992736816, "learning_rate": 1.9877237573985846e-06, "loss": 0.3237, "step": 18090 }, { "epoch": 2.122171415171767, "grad_norm": 1.7268571853637695, "learning_rate": 1.9828069303263125e-06, "loss": 0.3156, "step": 18100 }, { "epoch": 2.1233438855668894, "grad_norm": 1.4903814792633057, "learning_rate": 1.977894687473009e-06, "loss": 0.3026, "step": 18110 }, { "epoch": 2.124516355962012, "grad_norm": 1.3586783409118652, "learning_rate": 1.9729870363021697e-06, "loss": 0.3393, "step": 18120 }, { "epoch": 2.1256888263571345, "grad_norm": 1.8614357709884644, "learning_rate": 1.9680839842703083e-06, "loss": 0.303, "step": 18130 }, { "epoch": 2.126861296752257, "grad_norm": 1.5953949689865112, "learning_rate": 1.963185538826951e-06, "loss": 0.3021, "step": 18140 }, { "epoch": 2.1280337671473797, "grad_norm": 1.7195159196853638, "learning_rate": 1.9582917074146307e-06, "loss": 0.3513, "step": 18150 }, { "epoch": 2.129206237542502, "grad_norm": 2.088456153869629, "learning_rate": 1.953402497468866e-06, "loss": 0.3355, "step": 18160 }, { "epoch": 2.1303787079376244, "grad_norm": 1.611238718032837, "learning_rate": 1.9485179164181474e-06, "loss": 0.2865, "step": 18170 }, { "epoch": 2.131551178332747, "grad_norm": 1.3233238458633423, "learning_rate": 1.9436379716839445e-06, "loss": 0.3237, "step": 18180 }, { "epoch": 2.1327236487278696, "grad_norm": 1.3843278884887695, "learning_rate": 1.9387626706806743e-06, "loss": 0.3341, "step": 18190 }, { "epoch": 2.1338961191229924, "grad_norm": 1.6755337715148926, "learning_rate": 1.933892020815701e-06, "loss": 0.3066, "step": 18200 }, { "epoch": 2.1350685895181147, "grad_norm": 1.4012107849121094, "learning_rate": 1.9290260294893237e-06, "loss": 0.3039, "step": 18210 }, { "epoch": 2.136241059913237, "grad_norm": 1.5351438522338867, "learning_rate": 1.92416470409476e-06, "loss": 0.2744, "step": 18220 }, { "epoch": 2.13741353030836, "grad_norm": 1.4251302480697632, "learning_rate": 1.9193080520181405e-06, "loss": 0.2951, "step": 18230 }, { "epoch": 2.1385860007034823, "grad_norm": 1.6227809190750122, "learning_rate": 1.9144560806384943e-06, "loss": 0.3133, "step": 18240 }, { "epoch": 2.1397584710986046, "grad_norm": 1.7891151905059814, "learning_rate": 1.9096087973277394e-06, "loss": 0.3371, "step": 18250 }, { "epoch": 2.1409309414937274, "grad_norm": 1.712722659111023, "learning_rate": 1.9047662094506686e-06, "loss": 0.3146, "step": 18260 }, { "epoch": 2.14210341188885, "grad_norm": 1.362067461013794, "learning_rate": 1.8999283243649463e-06, "loss": 0.3023, "step": 18270 }, { "epoch": 2.143275882283972, "grad_norm": 1.6181689500808716, "learning_rate": 1.8950951494210863e-06, "loss": 0.3231, "step": 18280 }, { "epoch": 2.144448352679095, "grad_norm": 2.047621011734009, "learning_rate": 1.8902666919624479e-06, "loss": 0.3386, "step": 18290 }, { "epoch": 2.1456208230742173, "grad_norm": 1.8959155082702637, "learning_rate": 1.8854429593252222e-06, "loss": 0.3232, "step": 18300 }, { "epoch": 2.14679329346934, "grad_norm": 1.3556405305862427, "learning_rate": 1.8806239588384228e-06, "loss": 0.3081, "step": 18310 }, { "epoch": 2.1479657638644625, "grad_norm": 1.7222957611083984, "learning_rate": 1.8758096978238698e-06, "loss": 0.3535, "step": 18320 }, { "epoch": 2.149138234259585, "grad_norm": 1.4895355701446533, "learning_rate": 1.87100018359619e-06, "loss": 0.3395, "step": 18330 }, { "epoch": 2.1503107046547076, "grad_norm": 1.5366345643997192, "learning_rate": 1.8661954234627905e-06, "loss": 0.3145, "step": 18340 }, { "epoch": 2.15148317504983, "grad_norm": 1.6156436204910278, "learning_rate": 1.8613954247238587e-06, "loss": 0.3502, "step": 18350 }, { "epoch": 2.1526556454449524, "grad_norm": 1.745634913444519, "learning_rate": 1.8566001946723455e-06, "loss": 0.3499, "step": 18360 }, { "epoch": 2.153828115840075, "grad_norm": 1.3691952228546143, "learning_rate": 1.8518097405939594e-06, "loss": 0.3378, "step": 18370 }, { "epoch": 2.1550005862351975, "grad_norm": 1.5402336120605469, "learning_rate": 1.8470240697671489e-06, "loss": 0.326, "step": 18380 }, { "epoch": 2.15617305663032, "grad_norm": 1.8205065727233887, "learning_rate": 1.8422431894630998e-06, "loss": 0.3786, "step": 18390 }, { "epoch": 2.1573455270254427, "grad_norm": 1.3227442502975464, "learning_rate": 1.8374671069457156e-06, "loss": 0.2895, "step": 18400 }, { "epoch": 2.158517997420565, "grad_norm": 1.6865160465240479, "learning_rate": 1.8326958294716106e-06, "loss": 0.3073, "step": 18410 }, { "epoch": 2.159690467815688, "grad_norm": 1.8492602109909058, "learning_rate": 1.8279293642900992e-06, "loss": 0.364, "step": 18420 }, { "epoch": 2.16086293821081, "grad_norm": 1.4391928911209106, "learning_rate": 1.823167718643184e-06, "loss": 0.2868, "step": 18430 }, { "epoch": 2.1620354086059326, "grad_norm": 1.6211191415786743, "learning_rate": 1.8184108997655436e-06, "loss": 0.3367, "step": 18440 }, { "epoch": 2.1632078790010554, "grad_norm": 1.6139549016952515, "learning_rate": 1.8136589148845274e-06, "loss": 0.3492, "step": 18450 }, { "epoch": 2.1643803493961777, "grad_norm": 1.5606896877288818, "learning_rate": 1.8089117712201354e-06, "loss": 0.3051, "step": 18460 }, { "epoch": 2.1655528197913, "grad_norm": 1.538844108581543, "learning_rate": 1.804169475985012e-06, "loss": 0.3006, "step": 18470 }, { "epoch": 2.166725290186423, "grad_norm": 1.6317588090896606, "learning_rate": 1.7994320363844425e-06, "loss": 0.3342, "step": 18480 }, { "epoch": 2.1678977605815453, "grad_norm": 1.824723720550537, "learning_rate": 1.7946994596163236e-06, "loss": 0.3389, "step": 18490 }, { "epoch": 2.1690702309766676, "grad_norm": 1.4743640422821045, "learning_rate": 1.7899717528711692e-06, "loss": 0.2734, "step": 18500 }, { "epoch": 2.1690702309766676, "eval_loss": 0.7362726330757141, "eval_model_preparation_time": 0.0, "eval_runtime": 2150.4339, "eval_samples_per_second": 3.526, "eval_steps_per_second": 1.763, "step": 18500 }, { "epoch": 2.1702427013717904, "grad_norm": 1.5811266899108887, "learning_rate": 1.7852489233320968e-06, "loss": 0.342, "step": 18510 }, { "epoch": 2.171415171766913, "grad_norm": 1.721767783164978, "learning_rate": 1.7805309781748093e-06, "loss": 0.3217, "step": 18520 }, { "epoch": 2.1725876421620356, "grad_norm": 1.5130431652069092, "learning_rate": 1.7758179245675873e-06, "loss": 0.3306, "step": 18530 }, { "epoch": 2.173760112557158, "grad_norm": 1.6994069814682007, "learning_rate": 1.7711097696712849e-06, "loss": 0.3434, "step": 18540 }, { "epoch": 2.1749325829522803, "grad_norm": 1.4206398725509644, "learning_rate": 1.7664065206393111e-06, "loss": 0.3031, "step": 18550 }, { "epoch": 2.176105053347403, "grad_norm": 1.5484789609909058, "learning_rate": 1.7617081846176148e-06, "loss": 0.3333, "step": 18560 }, { "epoch": 2.1772775237425255, "grad_norm": 1.6027915477752686, "learning_rate": 1.7570147687446898e-06, "loss": 0.3159, "step": 18570 }, { "epoch": 2.178449994137648, "grad_norm": 1.588304877281189, "learning_rate": 1.7523262801515494e-06, "loss": 0.3061, "step": 18580 }, { "epoch": 2.1796224645327706, "grad_norm": 1.385800838470459, "learning_rate": 1.747642725961719e-06, "loss": 0.3286, "step": 18590 }, { "epoch": 2.180794934927893, "grad_norm": 1.6358096599578857, "learning_rate": 1.7429641132912322e-06, "loss": 0.318, "step": 18600 }, { "epoch": 2.1819674053230154, "grad_norm": 1.5587682723999023, "learning_rate": 1.7382904492486109e-06, "loss": 0.3309, "step": 18610 }, { "epoch": 2.183139875718138, "grad_norm": 1.6290295124053955, "learning_rate": 1.7336217409348577e-06, "loss": 0.3163, "step": 18620 }, { "epoch": 2.1843123461132605, "grad_norm": 1.440919280052185, "learning_rate": 1.7289579954434477e-06, "loss": 0.3592, "step": 18630 }, { "epoch": 2.1854848165083833, "grad_norm": 2.005171775817871, "learning_rate": 1.7242992198603142e-06, "loss": 0.3291, "step": 18640 }, { "epoch": 2.1866572869035057, "grad_norm": 1.426539421081543, "learning_rate": 1.7196454212638393e-06, "loss": 0.3025, "step": 18650 }, { "epoch": 2.187829757298628, "grad_norm": 1.3897396326065063, "learning_rate": 1.7149966067248458e-06, "loss": 0.3208, "step": 18660 }, { "epoch": 2.189002227693751, "grad_norm": 1.3411085605621338, "learning_rate": 1.710352783306581e-06, "loss": 0.3253, "step": 18670 }, { "epoch": 2.190174698088873, "grad_norm": 1.4695079326629639, "learning_rate": 1.7057139580647098e-06, "loss": 0.3282, "step": 18680 }, { "epoch": 2.1913471684839956, "grad_norm": 1.5728954076766968, "learning_rate": 1.701080138047303e-06, "loss": 0.3386, "step": 18690 }, { "epoch": 2.1925196388791184, "grad_norm": 1.5321180820465088, "learning_rate": 1.6964513302948272e-06, "loss": 0.3361, "step": 18700 }, { "epoch": 2.1936921092742407, "grad_norm": 1.3643341064453125, "learning_rate": 1.6918275418401308e-06, "loss": 0.3289, "step": 18710 }, { "epoch": 2.1948645796693635, "grad_norm": 1.4035167694091797, "learning_rate": 1.6872087797084424e-06, "loss": 0.362, "step": 18720 }, { "epoch": 2.196037050064486, "grad_norm": 2.230631113052368, "learning_rate": 1.6825950509173473e-06, "loss": 0.3383, "step": 18730 }, { "epoch": 2.1972095204596083, "grad_norm": 1.7343336343765259, "learning_rate": 1.677986362476786e-06, "loss": 0.3559, "step": 18740 }, { "epoch": 2.198381990854731, "grad_norm": 1.8525903224945068, "learning_rate": 1.6733827213890408e-06, "loss": 0.3248, "step": 18750 }, { "epoch": 2.1995544612498534, "grad_norm": 1.614118218421936, "learning_rate": 1.6687841346487244e-06, "loss": 0.3197, "step": 18760 }, { "epoch": 2.200726931644976, "grad_norm": 1.855404257774353, "learning_rate": 1.6641906092427713e-06, "loss": 0.3229, "step": 18770 }, { "epoch": 2.2018994020400986, "grad_norm": 1.8113679885864258, "learning_rate": 1.6596021521504235e-06, "loss": 0.3634, "step": 18780 }, { "epoch": 2.203071872435221, "grad_norm": 1.3969372510910034, "learning_rate": 1.6550187703432274e-06, "loss": 0.3346, "step": 18790 }, { "epoch": 2.2042443428303438, "grad_norm": 1.67252516746521, "learning_rate": 1.6504404707850142e-06, "loss": 0.3122, "step": 18800 }, { "epoch": 2.205416813225466, "grad_norm": 1.5685174465179443, "learning_rate": 1.6458672604318932e-06, "loss": 0.2917, "step": 18810 }, { "epoch": 2.2065892836205885, "grad_norm": 1.6172845363616943, "learning_rate": 1.6412991462322426e-06, "loss": 0.3001, "step": 18820 }, { "epoch": 2.2077617540157113, "grad_norm": 1.266152024269104, "learning_rate": 1.6367361351266975e-06, "loss": 0.308, "step": 18830 }, { "epoch": 2.2089342244108336, "grad_norm": 1.4798026084899902, "learning_rate": 1.632178234048138e-06, "loss": 0.3323, "step": 18840 }, { "epoch": 2.210106694805956, "grad_norm": 1.558036208152771, "learning_rate": 1.6276254499216843e-06, "loss": 0.3487, "step": 18850 }, { "epoch": 2.211279165201079, "grad_norm": 2.224095106124878, "learning_rate": 1.6230777896646782e-06, "loss": 0.3368, "step": 18860 }, { "epoch": 2.212451635596201, "grad_norm": 1.3588589429855347, "learning_rate": 1.6185352601866776e-06, "loss": 0.3028, "step": 18870 }, { "epoch": 2.2136241059913235, "grad_norm": 1.4374957084655762, "learning_rate": 1.6139978683894452e-06, "loss": 0.3138, "step": 18880 }, { "epoch": 2.2147965763864463, "grad_norm": 1.838011622428894, "learning_rate": 1.6094656211669368e-06, "loss": 0.3336, "step": 18890 }, { "epoch": 2.2159690467815687, "grad_norm": 1.8039276599884033, "learning_rate": 1.6049385254052913e-06, "loss": 0.353, "step": 18900 }, { "epoch": 2.2171415171766915, "grad_norm": 1.3478928804397583, "learning_rate": 1.600416587982825e-06, "loss": 0.3226, "step": 18910 }, { "epoch": 2.218313987571814, "grad_norm": 1.2816658020019531, "learning_rate": 1.5958998157700107e-06, "loss": 0.2941, "step": 18920 }, { "epoch": 2.219486457966936, "grad_norm": 1.4664472341537476, "learning_rate": 1.5913882156294764e-06, "loss": 0.3255, "step": 18930 }, { "epoch": 2.220658928362059, "grad_norm": 1.429824948310852, "learning_rate": 1.5868817944159948e-06, "loss": 0.3111, "step": 18940 }, { "epoch": 2.2218313987571814, "grad_norm": 1.6697758436203003, "learning_rate": 1.5823805589764624e-06, "loss": 0.3224, "step": 18950 }, { "epoch": 2.2230038691523037, "grad_norm": 1.427608847618103, "learning_rate": 1.5778845161499006e-06, "loss": 0.3146, "step": 18960 }, { "epoch": 2.2241763395474266, "grad_norm": 1.7440592050552368, "learning_rate": 1.573393672767446e-06, "loss": 0.3129, "step": 18970 }, { "epoch": 2.225348809942549, "grad_norm": 1.7817233800888062, "learning_rate": 1.5689080356523284e-06, "loss": 0.3665, "step": 18980 }, { "epoch": 2.2265212803376713, "grad_norm": 1.8191629648208618, "learning_rate": 1.5644276116198681e-06, "loss": 0.33, "step": 18990 }, { "epoch": 2.227693750732794, "grad_norm": 1.5648077726364136, "learning_rate": 1.5599524074774707e-06, "loss": 0.3382, "step": 19000 }, { "epoch": 2.227693750732794, "eval_loss": 0.7365580201148987, "eval_model_preparation_time": 0.0, "eval_runtime": 2152.6432, "eval_samples_per_second": 3.522, "eval_steps_per_second": 1.761, "step": 19000 }, { "epoch": 2.2288662211279164, "grad_norm": 1.769892930984497, "learning_rate": 1.5554824300246063e-06, "loss": 0.3228, "step": 19010 }, { "epoch": 2.2300386915230392, "grad_norm": 1.338385820388794, "learning_rate": 1.5510176860527992e-06, "loss": 0.3189, "step": 19020 }, { "epoch": 2.2312111619181616, "grad_norm": 1.859786033630371, "learning_rate": 1.5465581823456334e-06, "loss": 0.3042, "step": 19030 }, { "epoch": 2.232383632313284, "grad_norm": 1.6977640390396118, "learning_rate": 1.5421039256787213e-06, "loss": 0.3047, "step": 19040 }, { "epoch": 2.2335561027084068, "grad_norm": 1.7828209400177002, "learning_rate": 1.5376549228197069e-06, "loss": 0.3067, "step": 19050 }, { "epoch": 2.234728573103529, "grad_norm": 1.978924036026001, "learning_rate": 1.5332111805282535e-06, "loss": 0.3127, "step": 19060 }, { "epoch": 2.2359010434986515, "grad_norm": 1.6305441856384277, "learning_rate": 1.5287727055560287e-06, "loss": 0.3276, "step": 19070 }, { "epoch": 2.2370735138937743, "grad_norm": 1.4147400856018066, "learning_rate": 1.524339504646698e-06, "loss": 0.2929, "step": 19080 }, { "epoch": 2.2382459842888967, "grad_norm": 1.6729449033737183, "learning_rate": 1.5199115845359137e-06, "loss": 0.2943, "step": 19090 }, { "epoch": 2.239418454684019, "grad_norm": 1.7346460819244385, "learning_rate": 1.5154889519513044e-06, "loss": 0.3392, "step": 19100 }, { "epoch": 2.240590925079142, "grad_norm": 1.3805817365646362, "learning_rate": 1.5110716136124642e-06, "loss": 0.3118, "step": 19110 }, { "epoch": 2.241763395474264, "grad_norm": 1.667726755142212, "learning_rate": 1.5066595762309478e-06, "loss": 0.3359, "step": 19120 }, { "epoch": 2.242935865869387, "grad_norm": 1.4846937656402588, "learning_rate": 1.5022528465102504e-06, "loss": 0.3259, "step": 19130 }, { "epoch": 2.2441083362645093, "grad_norm": 1.660733699798584, "learning_rate": 1.4978514311458053e-06, "loss": 0.3191, "step": 19140 }, { "epoch": 2.2452808066596317, "grad_norm": 1.305396318435669, "learning_rate": 1.493455336824971e-06, "loss": 0.3222, "step": 19150 }, { "epoch": 2.2464532770547545, "grad_norm": 1.2296866178512573, "learning_rate": 1.4890645702270212e-06, "loss": 0.3057, "step": 19160 }, { "epoch": 2.247625747449877, "grad_norm": 1.3618738651275635, "learning_rate": 1.4846791380231334e-06, "loss": 0.3072, "step": 19170 }, { "epoch": 2.2487982178449992, "grad_norm": 1.5501824617385864, "learning_rate": 1.4802990468763856e-06, "loss": 0.3087, "step": 19180 }, { "epoch": 2.249970688240122, "grad_norm": 1.4805817604064941, "learning_rate": 1.4759243034417354e-06, "loss": 0.2817, "step": 19190 }, { "epoch": 2.2511431586352444, "grad_norm": 1.5453556776046753, "learning_rate": 1.4715549143660158e-06, "loss": 0.3215, "step": 19200 }, { "epoch": 2.2523156290303668, "grad_norm": 1.5343269109725952, "learning_rate": 1.467190886287927e-06, "loss": 0.2932, "step": 19210 }, { "epoch": 2.2534880994254896, "grad_norm": 1.2479933500289917, "learning_rate": 1.4628322258380217e-06, "loss": 0.2837, "step": 19220 }, { "epoch": 2.254660569820612, "grad_norm": 1.193922996520996, "learning_rate": 1.4584789396386972e-06, "loss": 0.2631, "step": 19230 }, { "epoch": 2.2558330402157347, "grad_norm": 1.3377540111541748, "learning_rate": 1.4541310343041891e-06, "loss": 0.2975, "step": 19240 }, { "epoch": 2.257005510610857, "grad_norm": 1.375259518623352, "learning_rate": 1.4497885164405534e-06, "loss": 0.3198, "step": 19250 }, { "epoch": 2.2581779810059794, "grad_norm": 1.197511911392212, "learning_rate": 1.4454513926456609e-06, "loss": 0.3182, "step": 19260 }, { "epoch": 2.2593504514011022, "grad_norm": 1.6332027912139893, "learning_rate": 1.441119669509189e-06, "loss": 0.3228, "step": 19270 }, { "epoch": 2.2605229217962246, "grad_norm": 1.6748228073120117, "learning_rate": 1.4367933536126067e-06, "loss": 0.3344, "step": 19280 }, { "epoch": 2.2616953921913474, "grad_norm": 1.6451077461242676, "learning_rate": 1.4324724515291688e-06, "loss": 0.3419, "step": 19290 }, { "epoch": 2.2628678625864698, "grad_norm": 2.0020792484283447, "learning_rate": 1.4281569698239073e-06, "loss": 0.3141, "step": 19300 }, { "epoch": 2.264040332981592, "grad_norm": 1.7223883867263794, "learning_rate": 1.4238469150536143e-06, "loss": 0.3215, "step": 19310 }, { "epoch": 2.2652128033767145, "grad_norm": 1.9799431562423706, "learning_rate": 1.4195422937668368e-06, "loss": 0.33, "step": 19320 }, { "epoch": 2.2663852737718373, "grad_norm": 1.534840703010559, "learning_rate": 1.415243112503873e-06, "loss": 0.3018, "step": 19330 }, { "epoch": 2.2675577441669597, "grad_norm": 1.610965371131897, "learning_rate": 1.4109493777967454e-06, "loss": 0.3444, "step": 19340 }, { "epoch": 2.2687302145620825, "grad_norm": 1.756552815437317, "learning_rate": 1.4066610961692056e-06, "loss": 0.3162, "step": 19350 }, { "epoch": 2.269902684957205, "grad_norm": 1.7546595335006714, "learning_rate": 1.4023782741367248e-06, "loss": 0.3302, "step": 19360 }, { "epoch": 2.271075155352327, "grad_norm": 1.724847435951233, "learning_rate": 1.3981009182064731e-06, "loss": 0.3375, "step": 19370 }, { "epoch": 2.27224762574745, "grad_norm": 1.8000494241714478, "learning_rate": 1.3938290348773165e-06, "loss": 0.295, "step": 19380 }, { "epoch": 2.2734200961425723, "grad_norm": 1.6342540979385376, "learning_rate": 1.389562630639807e-06, "loss": 0.3066, "step": 19390 }, { "epoch": 2.274592566537695, "grad_norm": 1.3576416969299316, "learning_rate": 1.3853017119761753e-06, "loss": 0.3324, "step": 19400 }, { "epoch": 2.2757650369328175, "grad_norm": 1.7177555561065674, "learning_rate": 1.3810462853603097e-06, "loss": 0.3368, "step": 19410 }, { "epoch": 2.27693750732794, "grad_norm": 1.6503722667694092, "learning_rate": 1.3767963572577586e-06, "loss": 0.3355, "step": 19420 }, { "epoch": 2.2781099777230627, "grad_norm": 1.6473678350448608, "learning_rate": 1.3725519341257193e-06, "loss": 0.2881, "step": 19430 }, { "epoch": 2.279282448118185, "grad_norm": 1.295916199684143, "learning_rate": 1.3683130224130204e-06, "loss": 0.2625, "step": 19440 }, { "epoch": 2.2804549185133074, "grad_norm": 1.8132423162460327, "learning_rate": 1.3640796285601154e-06, "loss": 0.306, "step": 19450 }, { "epoch": 2.28162738890843, "grad_norm": 1.7239766120910645, "learning_rate": 1.359851758999081e-06, "loss": 0.3444, "step": 19460 }, { "epoch": 2.2827998593035526, "grad_norm": 1.9465758800506592, "learning_rate": 1.3556294201535959e-06, "loss": 0.3643, "step": 19470 }, { "epoch": 2.283972329698675, "grad_norm": 1.5899369716644287, "learning_rate": 1.351412618438931e-06, "loss": 0.2939, "step": 19480 }, { "epoch": 2.2851448000937977, "grad_norm": 1.4667377471923828, "learning_rate": 1.3472013602619539e-06, "loss": 0.321, "step": 19490 }, { "epoch": 2.28631727048892, "grad_norm": 2.047821521759033, "learning_rate": 1.3429956520211034e-06, "loss": 0.3331, "step": 19500 }, { "epoch": 2.28631727048892, "eval_loss": 0.7384545803070068, "eval_model_preparation_time": 0.0, "eval_runtime": 2153.1888, "eval_samples_per_second": 3.521, "eval_steps_per_second": 1.761, "step": 19500 }, { "epoch": 2.287489740884043, "grad_norm": 1.679823875427246, "learning_rate": 1.3387955001063863e-06, "loss": 0.3339, "step": 19510 }, { "epoch": 2.2886622112791652, "grad_norm": 2.0139665603637695, "learning_rate": 1.3346009108993707e-06, "loss": 0.3244, "step": 19520 }, { "epoch": 2.2898346816742876, "grad_norm": 1.5202466249465942, "learning_rate": 1.3304118907731695e-06, "loss": 0.3069, "step": 19530 }, { "epoch": 2.2910071520694104, "grad_norm": 1.7452152967453003, "learning_rate": 1.3262284460924352e-06, "loss": 0.3269, "step": 19540 }, { "epoch": 2.2921796224645328, "grad_norm": 1.560340404510498, "learning_rate": 1.322050583213349e-06, "loss": 0.3127, "step": 19550 }, { "epoch": 2.293352092859655, "grad_norm": 1.8043817281723022, "learning_rate": 1.3178783084836116e-06, "loss": 0.3339, "step": 19560 }, { "epoch": 2.294524563254778, "grad_norm": 1.5446946620941162, "learning_rate": 1.3137116282424323e-06, "loss": 0.3115, "step": 19570 }, { "epoch": 2.2956970336499003, "grad_norm": 1.389460563659668, "learning_rate": 1.3095505488205228e-06, "loss": 0.3284, "step": 19580 }, { "epoch": 2.2968695040450227, "grad_norm": 1.8152519464492798, "learning_rate": 1.3053950765400836e-06, "loss": 0.3101, "step": 19590 }, { "epoch": 2.2980419744401455, "grad_norm": 1.5503618717193604, "learning_rate": 1.3012452177147943e-06, "loss": 0.3022, "step": 19600 }, { "epoch": 2.299214444835268, "grad_norm": 1.3729435205459595, "learning_rate": 1.2971009786498085e-06, "loss": 0.3175, "step": 19610 }, { "epoch": 2.3003869152303906, "grad_norm": 1.7879630327224731, "learning_rate": 1.292962365641739e-06, "loss": 0.3478, "step": 19620 }, { "epoch": 2.301559385625513, "grad_norm": 1.6988385915756226, "learning_rate": 1.2888293849786503e-06, "loss": 0.308, "step": 19630 }, { "epoch": 2.3027318560206353, "grad_norm": 2.0670838356018066, "learning_rate": 1.2847020429400536e-06, "loss": 0.3323, "step": 19640 }, { "epoch": 2.303904326415758, "grad_norm": 1.8189367055892944, "learning_rate": 1.280580345796888e-06, "loss": 0.3314, "step": 19650 }, { "epoch": 2.3050767968108805, "grad_norm": 1.9199049472808838, "learning_rate": 1.2764642998115179e-06, "loss": 0.3182, "step": 19660 }, { "epoch": 2.306249267206003, "grad_norm": 1.5913752317428589, "learning_rate": 1.272353911237722e-06, "loss": 0.3335, "step": 19670 }, { "epoch": 2.3074217376011257, "grad_norm": 1.2707297801971436, "learning_rate": 1.2682491863206825e-06, "loss": 0.2953, "step": 19680 }, { "epoch": 2.308594207996248, "grad_norm": 1.9378114938735962, "learning_rate": 1.264150131296975e-06, "loss": 0.3047, "step": 19690 }, { "epoch": 2.3097666783913704, "grad_norm": 1.617783784866333, "learning_rate": 1.2600567523945651e-06, "loss": 0.3137, "step": 19700 }, { "epoch": 2.310939148786493, "grad_norm": 1.4105325937271118, "learning_rate": 1.2559690558327903e-06, "loss": 0.3111, "step": 19710 }, { "epoch": 2.3121116191816156, "grad_norm": 1.414950966835022, "learning_rate": 1.2518870478223554e-06, "loss": 0.2981, "step": 19720 }, { "epoch": 2.3132840895767384, "grad_norm": 1.704995036125183, "learning_rate": 1.2478107345653224e-06, "loss": 0.3134, "step": 19730 }, { "epoch": 2.3144565599718607, "grad_norm": 1.3766077756881714, "learning_rate": 1.2437401222551004e-06, "loss": 0.3259, "step": 19740 }, { "epoch": 2.315629030366983, "grad_norm": 1.75359046459198, "learning_rate": 1.2396752170764364e-06, "loss": 0.2785, "step": 19750 }, { "epoch": 2.316801500762106, "grad_norm": 1.6450438499450684, "learning_rate": 1.23561602520541e-06, "loss": 0.3117, "step": 19760 }, { "epoch": 2.3179739711572283, "grad_norm": 1.5380266904830933, "learning_rate": 1.2315625528094155e-06, "loss": 0.2968, "step": 19770 }, { "epoch": 2.3191464415523506, "grad_norm": 1.8617204427719116, "learning_rate": 1.227514806047157e-06, "loss": 0.3169, "step": 19780 }, { "epoch": 2.3203189119474734, "grad_norm": 1.6534156799316406, "learning_rate": 1.2234727910686462e-06, "loss": 0.3279, "step": 19790 }, { "epoch": 2.3214913823425958, "grad_norm": 1.5101407766342163, "learning_rate": 1.2194365140151765e-06, "loss": 0.3318, "step": 19800 }, { "epoch": 2.322663852737718, "grad_norm": 1.8775062561035156, "learning_rate": 1.2154059810193274e-06, "loss": 0.3378, "step": 19810 }, { "epoch": 2.323836323132841, "grad_norm": 1.5260698795318604, "learning_rate": 1.211381198204955e-06, "loss": 0.291, "step": 19820 }, { "epoch": 2.3250087935279633, "grad_norm": 1.5114983320236206, "learning_rate": 1.207362171687174e-06, "loss": 0.2907, "step": 19830 }, { "epoch": 2.326181263923086, "grad_norm": 1.5104436874389648, "learning_rate": 1.2033489075723531e-06, "loss": 0.3093, "step": 19840 }, { "epoch": 2.3273537343182085, "grad_norm": 1.5890626907348633, "learning_rate": 1.1993414119581093e-06, "loss": 0.3289, "step": 19850 }, { "epoch": 2.328526204713331, "grad_norm": 1.5569175481796265, "learning_rate": 1.1953396909332944e-06, "loss": 0.31, "step": 19860 }, { "epoch": 2.3296986751084536, "grad_norm": 1.5697401762008667, "learning_rate": 1.191343750577979e-06, "loss": 0.33, "step": 19870 }, { "epoch": 2.330871145503576, "grad_norm": 1.7405133247375488, "learning_rate": 1.187353596963463e-06, "loss": 0.3275, "step": 19880 }, { "epoch": 2.332043615898699, "grad_norm": 1.8177491426467896, "learning_rate": 1.1833692361522459e-06, "loss": 0.3254, "step": 19890 }, { "epoch": 2.333216086293821, "grad_norm": 1.6528843641281128, "learning_rate": 1.179390674198026e-06, "loss": 0.2977, "step": 19900 }, { "epoch": 2.3343885566889435, "grad_norm": 1.4462692737579346, "learning_rate": 1.1754179171456963e-06, "loss": 0.3041, "step": 19910 }, { "epoch": 2.3355610270840663, "grad_norm": 1.894621729850769, "learning_rate": 1.1714509710313254e-06, "loss": 0.3626, "step": 19920 }, { "epoch": 2.3367334974791887, "grad_norm": 1.6731274127960205, "learning_rate": 1.1674898418821545e-06, "loss": 0.2995, "step": 19930 }, { "epoch": 2.337905967874311, "grad_norm": 1.743500828742981, "learning_rate": 1.1635345357165856e-06, "loss": 0.311, "step": 19940 }, { "epoch": 2.339078438269434, "grad_norm": 1.6634130477905273, "learning_rate": 1.1595850585441747e-06, "loss": 0.3173, "step": 19950 }, { "epoch": 2.340250908664556, "grad_norm": 2.974357843399048, "learning_rate": 1.1556414163656204e-06, "loss": 0.3149, "step": 19960 }, { "epoch": 2.3414233790596786, "grad_norm": 1.3696047067642212, "learning_rate": 1.1517036151727579e-06, "loss": 0.3487, "step": 19970 }, { "epoch": 2.3425958494548014, "grad_norm": 1.2937785387039185, "learning_rate": 1.1477716609485462e-06, "loss": 0.3252, "step": 19980 }, { "epoch": 2.3437683198499237, "grad_norm": 1.6861169338226318, "learning_rate": 1.1438455596670594e-06, "loss": 0.3245, "step": 19990 }, { "epoch": 2.3449407902450465, "grad_norm": 1.5139819383621216, "learning_rate": 1.1399253172934816e-06, "loss": 0.314, "step": 20000 }, { "epoch": 2.3449407902450465, "eval_loss": 0.7351424098014832, "eval_model_preparation_time": 0.0, "eval_runtime": 2153.3893, "eval_samples_per_second": 3.521, "eval_steps_per_second": 1.76, "step": 20000 }, { "epoch": 2.346113260640169, "grad_norm": 1.975669264793396, "learning_rate": 1.1360109397840935e-06, "loss": 0.2972, "step": 20010 }, { "epoch": 2.3472857310352913, "grad_norm": 1.5378589630126953, "learning_rate": 1.132102433086264e-06, "loss": 0.2867, "step": 20020 }, { "epoch": 2.348458201430414, "grad_norm": 1.3990730047225952, "learning_rate": 1.1281998031384434e-06, "loss": 0.3079, "step": 20030 }, { "epoch": 2.3496306718255364, "grad_norm": 2.0945851802825928, "learning_rate": 1.124303055870154e-06, "loss": 0.309, "step": 20040 }, { "epoch": 2.350803142220659, "grad_norm": 2.1660616397857666, "learning_rate": 1.1204121972019794e-06, "loss": 0.3215, "step": 20050 }, { "epoch": 2.3519756126157816, "grad_norm": 1.597339391708374, "learning_rate": 1.1165272330455545e-06, "loss": 0.3274, "step": 20060 }, { "epoch": 2.353148083010904, "grad_norm": 1.75774085521698, "learning_rate": 1.1126481693035606e-06, "loss": 0.3211, "step": 20070 }, { "epoch": 2.3543205534060263, "grad_norm": 1.6795837879180908, "learning_rate": 1.1087750118697126e-06, "loss": 0.3346, "step": 20080 }, { "epoch": 2.355493023801149, "grad_norm": 1.6776065826416016, "learning_rate": 1.104907766628751e-06, "loss": 0.3095, "step": 20090 }, { "epoch": 2.3566654941962715, "grad_norm": 1.455680251121521, "learning_rate": 1.101046439456438e-06, "loss": 0.306, "step": 20100 }, { "epoch": 2.3578379645913943, "grad_norm": 1.7484855651855469, "learning_rate": 1.0971910362195382e-06, "loss": 0.3011, "step": 20110 }, { "epoch": 2.3590104349865166, "grad_norm": 1.4823938608169556, "learning_rate": 1.0933415627758193e-06, "loss": 0.3285, "step": 20120 }, { "epoch": 2.360182905381639, "grad_norm": 1.4947311878204346, "learning_rate": 1.0894980249740378e-06, "loss": 0.3377, "step": 20130 }, { "epoch": 2.361355375776762, "grad_norm": 1.7389792203903198, "learning_rate": 1.0856604286539324e-06, "loss": 0.3451, "step": 20140 }, { "epoch": 2.362527846171884, "grad_norm": 1.1980576515197754, "learning_rate": 1.081828779646213e-06, "loss": 0.3034, "step": 20150 }, { "epoch": 2.3637003165670065, "grad_norm": 1.6849286556243896, "learning_rate": 1.078003083772558e-06, "loss": 0.3264, "step": 20160 }, { "epoch": 2.3648727869621293, "grad_norm": 1.8348019123077393, "learning_rate": 1.0741833468455953e-06, "loss": 0.33, "step": 20170 }, { "epoch": 2.3660452573572517, "grad_norm": 1.4360511302947998, "learning_rate": 1.0703695746689025e-06, "loss": 0.3582, "step": 20180 }, { "epoch": 2.367217727752374, "grad_norm": 1.5573744773864746, "learning_rate": 1.066561773036992e-06, "loss": 0.2949, "step": 20190 }, { "epoch": 2.368390198147497, "grad_norm": 1.751281976699829, "learning_rate": 1.0627599477353073e-06, "loss": 0.3009, "step": 20200 }, { "epoch": 2.369562668542619, "grad_norm": 1.9083530902862549, "learning_rate": 1.0589641045402088e-06, "loss": 0.3287, "step": 20210 }, { "epoch": 2.370735138937742, "grad_norm": 1.477790355682373, "learning_rate": 1.0551742492189722e-06, "loss": 0.3094, "step": 20220 }, { "epoch": 2.3719076093328644, "grad_norm": 1.7650055885314941, "learning_rate": 1.051390387529771e-06, "loss": 0.3197, "step": 20230 }, { "epoch": 2.3730800797279867, "grad_norm": 1.7915992736816406, "learning_rate": 1.047612525221673e-06, "loss": 0.3373, "step": 20240 }, { "epoch": 2.3742525501231095, "grad_norm": 1.4987279176712036, "learning_rate": 1.0438406680346364e-06, "loss": 0.3193, "step": 20250 }, { "epoch": 2.375425020518232, "grad_norm": 1.44124436378479, "learning_rate": 1.040074821699486e-06, "loss": 0.2939, "step": 20260 }, { "epoch": 2.3765974909133543, "grad_norm": 1.357345461845398, "learning_rate": 1.036314991937919e-06, "loss": 0.3267, "step": 20270 }, { "epoch": 2.377769961308477, "grad_norm": 1.8084501028060913, "learning_rate": 1.0325611844624934e-06, "loss": 0.318, "step": 20280 }, { "epoch": 2.3789424317035994, "grad_norm": 1.4971776008605957, "learning_rate": 1.028813404976613e-06, "loss": 0.2889, "step": 20290 }, { "epoch": 2.380114902098722, "grad_norm": 1.6454535722732544, "learning_rate": 1.025071659174524e-06, "loss": 0.3202, "step": 20300 }, { "epoch": 2.3812873724938446, "grad_norm": 2.0665669441223145, "learning_rate": 1.0213359527413087e-06, "loss": 0.3523, "step": 20310 }, { "epoch": 2.382459842888967, "grad_norm": 1.3467364311218262, "learning_rate": 1.017606291352869e-06, "loss": 0.3274, "step": 20320 }, { "epoch": 2.3836323132840898, "grad_norm": 1.5754767656326294, "learning_rate": 1.0138826806759206e-06, "loss": 0.3425, "step": 20330 }, { "epoch": 2.384804783679212, "grad_norm": 1.788784146308899, "learning_rate": 1.0101651263679928e-06, "loss": 0.3324, "step": 20340 }, { "epoch": 2.3859772540743345, "grad_norm": 1.783046841621399, "learning_rate": 1.0064536340774084e-06, "loss": 0.3025, "step": 20350 }, { "epoch": 2.3871497244694573, "grad_norm": 1.787916898727417, "learning_rate": 1.0027482094432788e-06, "loss": 0.2935, "step": 20360 }, { "epoch": 2.3883221948645796, "grad_norm": 1.785980463027954, "learning_rate": 9.99048858095501e-07, "loss": 0.3221, "step": 20370 }, { "epoch": 2.389494665259702, "grad_norm": 1.5774880647659302, "learning_rate": 9.953555856547403e-07, "loss": 0.3164, "step": 20380 }, { "epoch": 2.390667135654825, "grad_norm": 1.413008689880371, "learning_rate": 9.91668397732427e-07, "loss": 0.3485, "step": 20390 }, { "epoch": 2.391839606049947, "grad_norm": 1.5916597843170166, "learning_rate": 9.879872999307483e-07, "loss": 0.3085, "step": 20400 }, { "epoch": 2.3930120764450695, "grad_norm": 1.9006156921386719, "learning_rate": 9.843122978426357e-07, "loss": 0.3065, "step": 20410 }, { "epoch": 2.3941845468401923, "grad_norm": 1.4076749086380005, "learning_rate": 9.806433970517598e-07, "loss": 0.3453, "step": 20420 }, { "epoch": 2.3953570172353147, "grad_norm": 1.4751192331314087, "learning_rate": 9.769806031325246e-07, "loss": 0.3093, "step": 20430 }, { "epoch": 2.3965294876304375, "grad_norm": 1.6262187957763672, "learning_rate": 9.733239216500511e-07, "loss": 0.3134, "step": 20440 }, { "epoch": 2.39770195802556, "grad_norm": 1.7781968116760254, "learning_rate": 9.696733581601753e-07, "loss": 0.2975, "step": 20450 }, { "epoch": 2.398874428420682, "grad_norm": 1.2695683240890503, "learning_rate": 9.660289182094363e-07, "loss": 0.3312, "step": 20460 }, { "epoch": 2.400046898815805, "grad_norm": 1.7483234405517578, "learning_rate": 9.623906073350714e-07, "loss": 0.3019, "step": 20470 }, { "epoch": 2.4012193692109274, "grad_norm": 1.5114021301269531, "learning_rate": 9.587584310650027e-07, "loss": 0.3015, "step": 20480 }, { "epoch": 2.40239183960605, "grad_norm": 1.6820963621139526, "learning_rate": 9.551323949178366e-07, "loss": 0.3426, "step": 20490 }, { "epoch": 2.4035643100011725, "grad_norm": 1.6648640632629395, "learning_rate": 9.515125044028445e-07, "loss": 0.3228, "step": 20500 }, { "epoch": 2.4035643100011725, "eval_loss": 0.7376635074615479, "eval_model_preparation_time": 0.0, "eval_runtime": 2157.2278, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.757, "step": 20500 } ], "logging_steps": 10, "max_steps": 25587, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.993237491193479e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }